diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4204303 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/oprofile-0.9.9.tar.gz diff --git a/.oprofile.metadata b/.oprofile.metadata new file mode 100644 index 0000000..c91d9ef --- /dev/null +++ b/.oprofile.metadata @@ -0,0 +1 @@ +02a1f6609affb04a348dbddfdf8f03e66154f5be SOURCES/oprofile-0.9.9.tar.gz diff --git a/README.md b/README.md deleted file mode 100644 index 0e7897f..0000000 --- a/README.md +++ /dev/null @@ -1,5 +0,0 @@ -The master branch has no content - -Look at the c7 branch if you are working with CentOS-7, or the c4/c5/c6 branch for CentOS-4, 5 or 6 - -If you find this file in a distro specific branch, it means that no content has been checked in yet diff --git a/SOURCES/oprofile-0.4-guess2.patch b/SOURCES/oprofile-0.4-guess2.patch new file mode 100644 index 0000000..e956676 --- /dev/null +++ b/SOURCES/oprofile-0.4-guess2.patch @@ -0,0 +1,12 @@ +--- oprofile-0.4/gui/oprof_start_config.cpp.guess2 2002-10-01 18:32:31.000000000 -0400 ++++ oprofile-0.4/gui/oprof_start_config.cpp 2002-12-06 14:45:19.000000000 -0500 +@@ -98,8 +98,7 @@ + perror("oprof_start: Unable to determine OS release."); + } else { + string const version(info.release); +- string const vmlinux_path("/lib/modules/" + version +- + "/build/vmlinux"); ++ std::string const vmlinux_path("/usr/lib/debug/lib/modules/" + version + "/vmlinux"); + kernel_filename = vmlinux_path; + } + } diff --git a/SOURCES/oprofile-0.9.7-xen.patch b/SOURCES/oprofile-0.9.7-xen.patch new file mode 100644 index 0000000..0b2b241 --- /dev/null +++ b/SOURCES/oprofile-0.9.7-xen.patch @@ -0,0 +1,707 @@ +diff -up oprofile-0.9.7/daemon/init.c.xen oprofile-0.9.7/daemon/init.c +--- oprofile-0.9.7/daemon/init.c.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/init.c 2011-11-28 16:25:07.577000010 -0500 +@@ -312,6 +312,8 @@ static void opd_26_init(void) + + opd_create_vmlinux(vmlinux, kernel_range); + opd_create_xen(xenimage, xen_range); 
++ if (xen_passive_setup) ++ opd_create_passive(xen_passive_setup); + + opd_buf_size = opd_read_fs_int("/dev/oprofile/", "buffer_size", 1); + kernel_pointer_size = opd_read_fs_int("/dev/oprofile/", "pointer_size", 1); +diff -up oprofile-0.9.7/daemon/opd_kernel.c.xen oprofile-0.9.7/daemon/opd_kernel.c +--- oprofile-0.9.7/daemon/opd_kernel.c.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/opd_kernel.c 2011-11-28 16:25:07.579000010 -0500 +@@ -34,11 +34,22 @@ static struct kernel_image vmlinux_image + + static struct kernel_image xen_image; + ++static struct kernel_image xen_image_anon; ++static struct kernel_image vmlinux_image_anon; ++ ++static LIST_HEAD(passive_vmlinux); ++static LIST_HEAD(passive_xen); ++static LIST_HEAD(passive_apps); ++static LIST_HEAD(passive_modules); ++static LIST_HEAD(passive_xen_anon); ++ + void opd_create_vmlinux(char const * name, char const * arg) + { + /* vmlinux is *not* on the list of modules */ + list_init(&vmlinux_image.list); + ++ list_init(&vmlinux_image_anon.list); ++ + /* for no vmlinux */ + if (no_vmlinux) { + vmlinux_image.name = "no-vmlinux"; +@@ -57,13 +68,22 @@ void opd_create_vmlinux(char const * nam + vmlinux_image.start, vmlinux_image.end); + exit(EXIT_FAILURE); + } ++ ++ vmlinux_image_anon.name = "vmlinux-unknown"; ++ vmlinux_image_anon.start = vmlinux_image.start; ++ vmlinux_image_anon.end = vmlinux_image.end; ++ + } + + void opd_create_xen(char const * name, char const * arg) + { ++ int stat; ++ + /* xen is *not* on the list of modules */ + list_init(&xen_image.list); + ++ list_init(&xen_image_anon.list); ++ + /* for no xen */ + if (no_xen) { + xen_image.name = "no-xen"; +@@ -72,18 +92,106 @@ void opd_create_xen(char const * name, c + + xen_image.name = xstrdup(name); + +- sscanf(arg, "%llx,%llx", &xen_image.start, &xen_image.end); ++ stat = sscanf(arg, "%llx,%llx", &xen_image.start, &xen_image.end); ++ ++ xen_image_anon.name = "xen-unknown"; ++ xen_image_anon.start = xen_image.start; ++ 
xen_image_anon.end = xen_image.end; + + verbprintf(vmisc, "xen_start = %llx, xen_end = %llx\n", + xen_image.start, xen_image.end); + +- if (!xen_image.start && !xen_image.end) { ++ if ( stat != 2 ) { + fprintf(stderr, "error: mis-parsed xen range: %llx-%llx\n", + xen_image.start, xen_image.end); + exit(EXIT_FAILURE); + } ++ + } + ++void opd_create_passive_domain(int id, char const * image_kernel, ++ char const * range, char const * image_xen) ++{ ++ char file[64]; ++ struct kernel_image * image; ++ int stat; ++ ++ image = xmalloc(sizeof(struct kernel_image)); ++ image->name = xstrdup(image_kernel); ++ image->start = image->end = 0; ++ stat = sscanf(range, "%llx,%llx", &image->start, &image->end); ++ image->id = id; ++ list_add(&image->list, &passive_vmlinux); ++ ++ if ( stat != 2 ) { ++ fprintf(stderr, "error: mis-parsed passive domain range for " ++ "domain %d: %llx-%llx\n", id, image->start, image->end); ++ exit(EXIT_FAILURE); ++ } ++ ++ image = xmalloc(sizeof(struct kernel_image)); ++ image->name = xstrdup(image_xen); ++ image->start = xen_image.start; ++ image->end = xen_image.end; ++ image->id = id; ++ list_add(&image->list, &passive_xen); ++ ++ sprintf(file, "domain%d-apps", id); ++ image = xmalloc(sizeof(struct kernel_image)); ++ image->name = xstrdup(file); ++ image->start = 0; ++ image->end = 0; ++ image->id = id; ++ list_add(&image->list, &passive_apps); ++ ++ sprintf(file, "domain%d-modules", id); ++ image = xmalloc(sizeof(struct kernel_image)); ++ image->name = xstrdup(file); ++ image->start = 0; ++ image->end = 0; ++ stat = sscanf(range, "%llx,%llx", &image->start, &image->end); ++ image->id = id; ++ list_add(&image->list, &passive_modules); ++ ++ sprintf(file, "domain%d-xen-unknown", id); ++ image = xmalloc(sizeof(struct kernel_image)); ++ image->name = xstrdup(file); ++ image->start = xen_image.start; ++ image->end = xen_image.end; ++ image->id = id; ++ list_add(&image->list, &passive_xen_anon); ++ ++} ++ ++void opd_create_passive(char const 
*setup_file) ++{ ++ FILE *fp; ++ int id=0; ++ char image_kernel[128+1]; ++ char range[128+1]; ++ char image_xen[128+1]; ++ int stat; ++ ++ image_kernel[0] = range[0] = image_xen[0] = 0; ++ ++ fp = fopen(setup_file, "r"); ++ ++ if (!fp) { ++ fprintf(stderr, "error: Could not open Xen passive domain " ++ "setup file %s\n", setup_file); ++ exit(EXIT_FAILURE); ++ } ++ ++ while (1) { ++ stat = fscanf(fp, "%d %128s %128s %128s", &id, image_kernel, range, ++ image_xen); ++ if ( stat != 4 ) ++ break; /* EOF or malformed entry: leave the loop so fp is closed below */ ++ opd_create_passive_domain(id, image_kernel, range, image_xen); ++ } ++ ++ fclose(fp); ++} + + /** + * Allocate and initialise a kernel image description +@@ -210,6 +318,75 @@ struct kernel_image * find_kernel_image( + struct list_head * pos; + struct kernel_image * image = &vmlinux_image; + ++ if (current_domain != COORDINATOR_DOMAIN) { ++ /* we rely on cpu_mode value (i.e. trans->in_kernel) ++ * to search the right image type: xen, kernel or user ++ * We cannot use address ranges since hypervisor does not ++ * share the same address space with fully virtualized guests, ++ * and thus address ranges can overlap */ ++ switch ( trans->in_kernel ) { ++ ++ /* user mode */ ++ case 1: ++ list_for_each(pos, &passive_apps) { ++ image = list_entry(pos, struct kernel_image, list); ++ if (image->id == current_domain) ++ return image; ++ } ++ return NULL; ++ ++ /* kernel mode */ ++ case 2: ++ list_for_each(pos, &passive_vmlinux) { ++ image = list_entry(pos, struct kernel_image, list); ++ if ( (image->id == current_domain) ++ && ( (image->start == 0 && image->end == 0) ++ || (image->start <= trans->pc ++ && image->end > trans->pc) ) ) ++ return image; ++ } ++ /* if not in kernel image range then it should be a module */ ++ list_for_each(pos, &passive_modules) { ++ image = list_entry(pos, struct kernel_image, list); ++ if (image->id == current_domain) ++ return image; ++ } ++ /* This should not happen if the kernel and user level ++ oprofile code are sane and in sync */ ++ return NULL; 
++ ++ /* hypervisor mode */ ++ case 3: ++ list_for_each(pos, &passive_xen) { ++ image = list_entry(pos, struct kernel_image, list); ++ if (image->id == current_domain ++ && image->start <= trans->pc ++ && image->end > trans->pc) ++ return image; ++ } ++ list_for_each(pos, &passive_xen_anon) { ++ image = list_entry(pos, struct kernel_image, list); ++ if (image->id == current_domain) ++ return image; ++ } ++ return NULL; ++ ++ default: ++ printf("Unexpected error on passive mode: CPU mode is " ++ "%d for domain %d\n", trans->in_kernel, current_domain); ++ return NULL; ++ } ++ ++ ++ } ++ ++ if (xen_image.start <= trans->pc && xen_image.end > trans->pc) ++ return &xen_image; ++ ++ if (trans->in_kernel == 2) { ++ return &xen_image_anon; ++ } ++ + if (no_vmlinux) + return image; + +@@ -222,8 +399,5 @@ struct kernel_image * find_kernel_image( + return image; + } + +- if (xen_image.start <= trans->pc && xen_image.end > trans->pc) +- return &xen_image; +- +- return NULL; ++ return &vmlinux_image_anon; + } +diff -up oprofile-0.9.7/daemon/opd_kernel.h.xen oprofile-0.9.7/daemon/opd_kernel.h +--- oprofile-0.9.7/daemon/opd_kernel.h.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/opd_kernel.h 2011-11-28 16:25:07.580000010 -0500 +@@ -23,8 +23,12 @@ struct transient; + /** create the kernel image */ + void opd_create_vmlinux(char const * name, char const * arg); + ++/** create Xen image */ + void opd_create_xen(char const * name, char const * arg); + ++/** create Xen passive domain images */ ++void opd_create_passive(char const *setup_file); ++ + /** opd_reread_module_info - parse /proc/modules for kernel modules */ + void opd_reread_module_info(void); + +@@ -33,6 +37,7 @@ struct kernel_image { + char * name; + vma_t start; + vma_t end; ++ int id; + struct list_head list; + }; + +diff -up oprofile-0.9.7/daemon/opd_sfile.c.xen oprofile-0.9.7/daemon/opd_sfile.c +--- oprofile-0.9.7/daemon/opd_sfile.c.xen 2011-07-04 22:25:04.000000000 -0400 ++++ 
oprofile-0.9.7/daemon/opd_sfile.c 2011-11-28 16:25:07.582000010 -0500 +@@ -240,7 +240,7 @@ struct sfile * sfile_find(struct transie + } + + /* we might need a kernel image start/end to hash on */ +- if (trans->in_kernel) { ++ else if (trans->in_kernel) { + ki = find_kernel_image(trans); + if (!ki) { + verbprintf(vsamples, "Lost kernel sample %llx\n", trans->pc); +diff -up oprofile-0.9.7/daemon/opd_trans.c.xen oprofile-0.9.7/daemon/opd_trans.c +--- oprofile-0.9.7/daemon/opd_trans.c.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/opd_trans.c 2011-11-28 16:25:07.584000010 -0500 +@@ -31,6 +31,8 @@ + #include + #include + ++int32_t current_domain = COORDINATOR_DOMAIN; ++ + extern size_t kernel_pointer_size; + + +@@ -203,6 +205,9 @@ static void code_kernel_enter(struct tra + { + verbprintf(vmisc, "KERNEL_ENTER_SWITCH to kernel\n"); + trans->in_kernel = 1; ++ /* if in passive domain mode cpu mode should be incremented */ ++ if (current_domain != COORDINATOR_DOMAIN) ++ trans->in_kernel++; + clear_trans_current(trans); + /* subtlety: we must keep trans->cookie cached, + * even though it's meaningless for the kernel - +@@ -216,6 +221,9 @@ static void code_user_enter(struct trans + { + verbprintf(vmisc, "USER_ENTER_SWITCH to user-space\n"); + trans->in_kernel = 0; ++ /* if in passive domain mode cpu mode should be incremented */ ++ if (current_domain != COORDINATOR_DOMAIN) ++ trans->in_kernel++; + clear_trans_current(trans); + clear_trans_last(trans); + } +@@ -244,17 +252,34 @@ static void code_trace_begin(struct tran + static void code_xen_enter(struct transient * trans) + { + verbprintf(vmisc, "XEN_ENTER_SWITCH to xen\n"); +- trans->in_kernel = 1; ++ trans->in_kernel = 2; ++ /* if in passive domain mode cpu mode should be incremented */ ++ if (current_domain != COORDINATOR_DOMAIN) ++ trans->in_kernel++; + trans->current = NULL; + /* subtlety: we must keep trans->cookie cached, even though it's +- * meaningless for Xen - we won't necessarily get a cookie 
switch +- * on Xen exit. See comments in opd_sfile.c. It seems that we can +- * get away with in_kernel = 1 as long as we supply the correct +- * Xen image, and its address range in startup find_kernel_image +- * is modified to look in the Xen image also +- */ ++ * meaningless for Xen - same reason as for kernel */ + } + ++static void code_domain_switch(struct transient *trans) ++{ ++ /* While processing passive domain samples we ensure (in_kernel!=0) ++ * We do this in order to ignore cookies for passive domain samples ++ * But, we have to remember the kernel value for coordinator domain, ++ * so we do the safe thing: increment when leaving the coordinator ++ * domain and decrement when returning to it ++ */ ++ if (current_domain == COORDINATOR_DOMAIN) ++ trans->in_kernel++; ++ ++ trans->current = NULL; ++ current_domain = (int32_t) pop_buffer_value(trans); ++ ++ /* If returning to coordinator domain restore the kernel value */ ++ if (current_domain == COORDINATOR_DOMAIN) ++ trans->in_kernel--; ++} ++ + extern void code_spu_profiling(struct transient * trans); + extern void code_spu_ctx_switch(struct transient * trans); + +@@ -278,7 +303,7 @@ handler_t handlers[LAST_CODE + 1] = { + &code_spu_profiling, + &code_spu_ctx_switch, + #else +- &code_unknown, ++ &code_domain_switch, + &code_unknown, + #endif + &code_ibs_fetch_sample, +diff -up oprofile-0.9.7/daemon/opd_trans.h.xen oprofile-0.9.7/daemon/opd_trans.h +--- oprofile-0.9.7/daemon/opd_trans.h.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/opd_trans.h 2011-11-28 16:25:07.585000010 -0500 +@@ -21,6 +21,10 @@ + + #include + ++#define COORDINATOR_DOMAIN -1 ++ ++extern int32_t current_domain; ++ + struct sfile; + struct anon_mapping; + +diff -up oprofile-0.9.7/daemon/oprofiled.c.xen oprofile-0.9.7/daemon/oprofiled.c +--- oprofile-0.9.7/daemon/oprofiled.c.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/oprofiled.c 2011-11-28 16:25:07.587000010 -0500 +@@ -71,6 +71,7 @@ char * 
session_dir; + int no_xen; + char * xenimage; + char * xen_range; ++char * xen_passive_setup; + static char * verbose; + static char * binary_name_filter; + static char * events; +@@ -91,6 +92,7 @@ static struct poptOption options[] = { + { "xen-range", 0, POPT_ARG_STRING, &xen_range, 0, "Xen VMA range", "start-end", }, + { "xen-image", 0, POPT_ARG_STRING, &xenimage, 0, "Xen image", "file", }, + { "image", 0, POPT_ARG_STRING, &binary_name_filter, 0, "image name filter", "profile these comma separated image" }, ++ { "xen-passive-setup", 0, POPT_ARG_STRING, &xen_passive_setup, 0, "Xen passive domain setup file", "filename", }, + { "separate-lib", 0, POPT_ARG_INT, &separate_lib, 0, "separate library samples for each distinct application", "[0|1]", }, + { "separate-kernel", 0, POPT_ARG_INT, &separate_kernel, 0, "separate kernel samples for each distinct application", "[0|1]", }, + { "separate-thread", 0, POPT_ARG_INT, &separate_thread, 0, "thread-profiling mode", "[0|1]" }, +diff -up oprofile-0.9.7/daemon/oprofiled.h.xen oprofile-0.9.7/daemon/oprofiled.h +--- oprofile-0.9.7/daemon/oprofiled.h.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/daemon/oprofiled.h 2011-11-28 16:25:07.588000010 -0500 +@@ -65,5 +65,6 @@ extern char * kernel_range; + extern int no_xen; + extern char * xenimage; + extern char * xen_range; ++extern char * xen_passive_setup; + + #endif /* OPROFILED_H */ +diff -up oprofile-0.9.7/doc/opcontrol.1.in.xen oprofile-0.9.7/doc/opcontrol.1.in +--- oprofile-0.9.7/doc/opcontrol.1.in.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/doc/opcontrol.1.in 2011-11-28 16:25:07.590000010 -0500 +@@ -158,12 +158,41 @@ Xen image + .br + .TP + .BI "--active-domains=" +-List of domain ids participating in a multi-domain profiling session. If ++List of domain ids participating in a multi-domain profiling session. ++Each of the specified domains must run an instance of oprofile. 
The ++sequence of opcontrol commands in each domain must follow a given ++order which is specified in the oprofile user manual. If + more than one domain is specified in they should be separated using + commas. This option can only be used in domain 0 which is the only domain + that can coordinate a multi-domain profiling session. Including domain 0 in + the list of active domains is optional. (e.g. --active-domains=2,5,6 and +---active-domains=0,2,5,6 are equivalent) ++--active-domains=0,2,5,6 are equivalent). ++This option can only be specified ++if --start-daemon is also specified and it is only ++valid for the current run of the oprofile daemon; i.e. the list ++of active domains is not persistent. ++.br ++.TP ++.BI "--passive-domains=" or "--domains=" ++List of domain ids to be profiled, separated by commas. ++As opposed to the --active-domains option, the domains specified with this ++option do not need to run oprofile. This makes ++profiling multiple domains easier. However, with the passive-domains option, ++samples in user level processes and kernel modules cannot be ++mapped to specific symbols and are aggregated ++under a generic class. Both --active-domains and --passive-domains ++options can be specified in the same command, but the same domain cannot be ++specified in both options. This option can only be specified if either --start ++or --start-daemon is specified on the same command and it is only valid for ++the current run of the oprofile daemon; i.e. the list of passive domains is ++not persistent. ++.br ++.TP ++.BI "--passive-images=" or "--domain-images=" ++List of kernel images associated with the domains specified in the ++--passive-domains option, also separated by commas. The association ++between the images and domains is based on the order they are ++specified in both options. 
+ .br + .SH OPTIONS (specific to System z) + .TP +diff -up oprofile-0.9.7/libpp/format_output.cpp.xen oprofile-0.9.7/libpp/format_output.cpp +--- oprofile-0.9.7/libpp/format_output.cpp.xen 2011-07-04 22:25:04.000000000 -0400 ++++ oprofile-0.9.7/libpp/format_output.cpp 2011-11-28 16:25:07.592000010 -0500 +@@ -287,8 +287,8 @@ string formatter::format_app_name(field_ + { + return get_image_name(f.symbol.app_name, + long_filenames +- ? image_name_storage::int_real_filename +- : image_name_storage::int_real_basename, ++ ? image_name_storage::int_filename ++ : image_name_storage::int_basename, + extra_found_images); + } + +diff -up oprofile-0.9.7/utils/opcontrol.xen oprofile-0.9.7/utils/opcontrol +--- oprofile-0.9.7/utils/opcontrol.xen 2011-07-20 15:36:48.000000000 -0400 ++++ oprofile-0.9.7/utils/opcontrol 2011-11-28 16:28:56.431000248 -0500 +@@ -236,9 +236,16 @@ opcontrol: usage: + buffer-size. + --cpu-buffer-size=num per-cpu buffer size in units (2.6 kernel) + Same rules as defined for buffer-size. +- --xen Xen image (for Xen only) +- --active-domains= List of domains in profiling session (for Xen) +- (list contains domain ids separated by commas) ++ --xen=file Xen image (for Xen only) ++ --active-domains=id[,ids] list of domains in multiple domain profiling session (Xen) ++ (detailed profiling of user level and kernel modules code) ++ (requires running oprofile on these domains) ++ --passive-domains=id[,ids] list of domains to be profiled (Xen). 
++ or --domains=id[,ids] (coarse profiling of user level and kernel modules code) ++ (no need to run oprofile on these domains) ++ --passive-images=file[,files] list of kernel images associated with each passive domain ++ or ++ --domain-images=file[,files] + + System z specific options + +@@ -388,6 +395,9 @@ do_init() + SETUP_FILE="$SETUP_DIR/daemonrc" + SEC_SETUP_FILE="$SETUP_DIR/daemonrc_new" + ++ # location for passing info about passive domains to daemon ++ PASSIVE_SETUP_FILE="$SETUP_DIR/xendomain.setup" ++ + # initialize daemon vars + decide_oprofile_device_mount + CPUTYPE=`cat $MOUNT/cpu_type` +@@ -539,7 +549,7 @@ do_load_setup() + } + + +-check_valid_args() ++check_valid_vmlinux() + { + if test -z "$VMLINUX"; then + echo "No vmlinux file specified. You must specify the correct vmlinux file, e.g." >&2 +@@ -560,8 +570,12 @@ check_valid_args() + + echo "The specified vmlinux file \"$VMLINUX\" doesn't exist." >&2 + exit 1 ++} ++ + + # similar check for Xen image ++check_valid_xen() ++{ + if test -f "$XENIMAGE"; then + return + fi +@@ -622,6 +636,77 @@ get_image_range() + } + + ++set_passive_domain() ++{ ++ DOMAIN_ID=$1 ++ FILE_IMAGE=$2 ++ XEN_IMAGE=$3 ++ ++ if test "$FILE_IMAGE" = "none"; then ++ RANGE="0,0" ++ FILE_IMAGE="domain$DOMAIN_ID-kernel" ++ else ++ # Find VMA range for passive domain kernel image ++ range_info=`objdump -h $FILE_IMAGE 2>/dev/null | grep " .text "` ++ tmp1=`echo $range_info | awk '{print $4}'` ++ tmp_length=`echo $range_info | awk '{print $3}'` ++ tmp2=`objdump -h $FILE_IMAGE --adjust-vma=0x$tmp_length 2>/dev/null | grep " .text " | awk '{print $4}'` ++ ++ if test -z "$tmp1" -o -z "$tmp2"; then ++ echo "The specified file $FILE_IMAGE does not seem to be valid" >&2 ++ echo "Make sure you are using the non-compressed image file (e.g. 
vmlinux not vmlinuz)" >&2 ++ vecho "found start as \"$tmp1\", end as \"$tmp2\"" >&2 ++ exit 1 ++ fi ++ RANGE="`echo $tmp1`,`echo $tmp2`" ++ fi ++ echo " $DOMAIN_ID $FILE_IMAGE $RANGE $XEN_IMAGE" >> $PASSIVE_SETUP_FILE ++} ++ ++ ++set_passive_domain_config() ++{ ++ ++ create_dir "$SETUP_DIR" ++ ++ touch $PASSIVE_SETUP_FILE ++ chmod 644 $PASSIVE_SETUP_FILE ++ >$PASSIVE_SETUP_FILE ++ ++ NDOMAINS=`echo "$PASSIVE_DOMAINS" | awk -F',' '{print NF}'` ++ ++ if test -n "$PASSIVE_IMAGES"; then ++ NIMAGES=`echo "$PASSIVE_IMAGES" | awk -F',' '{print NF}'` ++ if [ $NDOMAINS != $NIMAGES ]; then ++ echo "# of passive domains and # of passive images doesn't match." >&2 ++ do_help ++ exit 1 ++ fi ++ ++ for (( i=1; i<=$NDOMAINS; i++ )); do ++ ID=`echo "$PASSIVE_DOMAINS" | awk -F"," '{print $'$i'}'` ++ FILE=`echo "$PASSIVE_IMAGES" | awk -F',' '{print $'$i'}'` ++ if test ! -f "$FILE"; then ++ echo "Image $FILE for passive domain $ID not found." >&2 ++ return 1 ++ fi ++ LNK_KERNEL=/boot/domain$ID-kernel ++ ln -sf $FILE $LNK_KERNEL ++ LNK_XEN=/boot/domain$ID-xen ++ ln -sf $XENIMAGE $LNK_XEN ++ set_passive_domain $ID $LNK_KERNEL $LNK_XEN ++ done ++ else ++ for (( i=1; i<=$NDOMAINS; i++ )); do ++ ID=`echo "$PASSIVE_DOMAINS" | awk -F"," '{print $'$i'}'` ++ LNK_XEN=/boot/domain$ID-xen ++ set_passive_domain $ID none $LNK_XEN ++ done ++ ++ fi ++} ++ ++ + # validate --separate= parameters. 
This function is called with IFS=, + # so on each argument is splitted + validate_separate_args() +@@ -932,10 +1017,20 @@ do_options() + DO_SETUP=yes + ;; + --active-domains) +- error_if_invalid_arg $arg $val ++ error_if_invalid_arg "$arg" "$val" + ACTIVE_DOMAINS=$val + DO_SETUP=yes + ;; ++ --passive-domains|--domains) ++ error_if_invalid_arg "$arg" "$val" ++ PASSIVE_DOMAINS=$val ++ DO_SETUP=yes ++ ;; ++ --passive-images|--domain-images) ++ error_if_invalid_arg "$arg" "$val" ++ PASSIVE_IMAGES=$val ++ DO_SETUP=yes ++ ;; + -i|--image) + error_if_invalid_arg "$arg" "$val" + if test "$val" = "all"; then +@@ -1366,6 +1461,16 @@ check_event_mapping_data() + exit 1 + fi + fi ++ ++ if test -n "$ACTIVE_DOMAINS" -a "$START_DAEMON" != "yes"; then ++ echo "Option \"--active-domains\" can only be used with option \"--start-daemon\"." >&2 ++ exit 1 ++ fi ++ ++ if test -n "$PASSIVE_DOMAINS" -a "$START_DAEMON" != "yes" -a "$START" != "yes"; then ++ echo "Option \"--passive-domains\" or \"--domains\" can only be used with option \"--start-daemon\" or \"--start\"." >&2 ++ exit 1 ++ fi + } + + +@@ -1404,6 +1509,15 @@ do_param_setup() + fi + fi + ++ if test -n "$PASSIVE_DOMAINS"; then ++ if test "$KERNEL_SUPPORT" = "yes"; then ++ echo $PASSIVE_DOMAINS >$MOUNT/passive_domains ++ set_passive_domain_config ++ else ++ echo "passive-domains not supported - ignored" >&2 ++ fi ++ fi ++ + if test $NOTE_SIZE != 0; then + set_param notesize $NOTE_SIZE + fi +@@ -1566,7 +1680,8 @@ do_start_daemon() + fi + + do_setup +- check_valid_args ++ check_valid_vmlinux ++ check_valid_xen + get_image_range "linux" + get_image_range "xen" + do_param_setup +@@ -1600,6 +1715,10 @@ do_start_daemon() + OPD_ARGS="$OPD_ARGS --image=$IMAGE_FILTER" + fi + ++ if ! 
test -z "$PASSIVE_DOMAINS"; then ++ OPD_ARGS="$OPD_ARGS --xen-passive-setup=$PASSIVE_SETUP_FILE" ++ fi ++ + if test -n "$VERBOSE"; then + OPD_ARGS="$OPD_ARGS --verbose=$VERBOSE" + fi +@@ -1805,6 +1924,8 @@ do_save_session() + fi + + hup_daemon ++ ++ rm -f /boot/domain[0-9]*-kernel /boot/domain[0-9]*-xen + } + + +@@ -1855,7 +1976,8 @@ do_operations() + fi + + if test "$SETUP" = "yes"; then +- check_valid_args ++ check_valid_vmlinux ++ check_valid_xen + do_save_setup + fi + diff --git a/SOURCES/oprofile-aarch64.patch b/SOURCES/oprofile-aarch64.patch new file mode 100644 index 0000000..fec25f6 --- /dev/null +++ b/SOURCES/oprofile-aarch64.patch @@ -0,0 +1,677 @@ +commit 34d0065a1a790fc2be05a5ef1d8b0bbf28b814fe +Author: William Cohen +Date: Wed Feb 12 08:05:38 2014 -0600 + + Provide basic AArch64 (ARMv8) support + + The AArch64 (ARMv8) support is provided as an ARM variant to allow use + in both 32-bit and 64-bit ARM environments. The support in this patch + is just the basic events described in the AArch64 documentation. + AArch64 processor implementation may provide additional implementation + specific events. One could add code to recognize those processor + specific implementations and include the armv8-pmuv3-common base + events into the event sets for the processor implementations. + The APM X-Gene processor type is included in this patch as an + implementation, although there are no known processor-specific events + to add at this time. + + Below is example run on the ARM Foundation simulator collecting data + on a build of OProfile. + + $ cd oprofile + $ operf make + ... + $ opreport -t 5 + Using /home/wcohen/oprofile/oprofile/oprofile_data/samples/ for samples directory. + + WARNING: Lost samples detected! See /home/wcohen/oprofile/oprofile/oprofile_data/samples/operf.log for details. 
+ CPU: ARM AArch64 + Counted CPU_CYCLES events (Cycle) with a unit mask of 0x00 (No unit mask) count 100000 + CPU_CYCLES:100000| + samples| %| + ------------------ + 10943 90.5877 make + CPU_CYCLES:100000| + samples| %| + ------------------ + 5281 48.2592 make + 4543 41.5151 libc-2.17.so + 1079 9.8602 kallsyms + 40 0.3655 ld-2.17.so + 735 6.0844 sh + CPU_CYCLES:100000| + samples| %| + ------------------ + 321 43.6735 kallsyms + 298 40.5442 libc-2.17.so + 94 12.7891 bash + 22 2.9932 ld-2.17.so + + Signed-off-by: William Cohen + +diff --git a/events/Makefile.am b/events/Makefile.am +index ad45642..3e43d10 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -59,6 +59,8 @@ event_files = \ + arm/armv7-ca7/events arm/armv7-ca7/unit_masks \ + arm/armv7-ca15/events arm/armv7-ca15/unit_masks \ + arm/mpcore/events arm/mpcore/unit_masks \ ++ arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ ++ arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-pmuv3-common/events b/events/arm/armv8-pmuv3-common/events +new file mode 100644 +index 0000000..3cdff03 +--- /dev/null ++++ b/events/arm/armv8-pmuv3-common/events +@@ -0,0 +1,38 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARMv8 pmu v3 architected events ++ ++event:0x00 um:zero minimum:500 name:SW_INCR : Instruction architecturally executed, condition code check pass, software increment ++event:0x01 um:zero minimum:5000 name:L1I_CACHE_REFILL : Level 1 instruction cache refill ++event:0x02 um:zero minimum:5000 name:L1I_TLB_REFILL : Level 1 instruction TLB refill ++event:0x03 um:zero minimum:5000 name:L1D_CACHE_REFILL : Level 1 data cache refill ++event:0x04 um:zero minimum:5000 name:L1D_CACHE : Level 1 data cache access ++event:0x05 um:zero minimum:5000 name:L1D_TLB_REFILL : Level 1 data TLB refill ++event:0x06 um:zero minimum:100000 name:LD_RETIRED : Instruction architecturally executed, condition code check pass, load ++event:0x07 um:zero minimum:100000 name:ST_RETIRED : Instruction architecturally executed, condition code check pass, store ++event:0x08 um:zero minimum:100000 name:INST_RETIRED : Instruction architecturally executed ++event:0x09 um:zero minimum:500 name:EXC_TAKEN : Exception taken ++event:0x0A um:zero minimum:500 name:EXC_RETURN : Instruction architecturally executed, condition code check pass, exception return ++event:0x0B um:zero minimum:500 name:CID_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to CONTEXTIDR ++event:0x0C um:zero minimum:5000 name:PC_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, software change of the PC ++event:0x0D um:zero minimum:5000 name:BR_IMMED_RETIRED : Instruction architecturally executed, immediate branch ++event:0x0E um:zero minimum:5000 name:BR_RETURN_RETIRED : Instruction architecturally executed, condition code check pass, procedure return ++event:0x0F um:zero minimum:500 name:UNALIGNED_LDST_RETIRED : Instruction architecturally executed, condition code check pass, unaligned load or store ++event:0x10 um:zero minimum:5000 name:BR_MIS_PRED : Mispredicted or not predicted branch speculatively executed ++event:0x11 um:zero 
minimum:100000 name:CPU_CYCLES : Cycle ++event:0x12 um:zero minimum:5000 name:BR_PRED : Predictable branch speculatively executed ++event:0x13 um:zero minimum:100000 name:MEM_ACCESS : Data memory access ++event:0x14 um:zero minimum:5000 name:L1I_CACHE : Level 1 instruction cache access ++event:0x15 um:zero minimum:5000 name:L1D_CACHE_WB : Level 1 data cache write-back ++event:0x16 um:zero minimum:5000 name:L2D_CACHE : Level 2 data cache access ++event:0x17 um:zero minimum:5000 name:L2D_CACHE_REFILL : Level 2 data cache refill ++event:0x18 um:zero minimum:5000 name:L2D_CACHE_WB : Level 2 data cache write-back ++event:0x19 um:zero minimum:5000 name:BUS_ACCESS : Bus access ++event:0x1A um:zero minimum:500 name:MEMORY_ERROR : Local memory error ++event:0x1B um:zero minimum:100000 name:INST_SPEC : Operation speculatively executed ++event:0x1C um:zero minimum:5000 name:TTBR_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to TTBR ++event:0x1D um:zero minimum:5000 name:BUS_CYCLES : Bus cycle ++event:0x1F um:zero minimum:5000 name:L1D_CACHE_ALLOCATE : Level 1 data cache allocation without refill ++event:0x20 um:zero minimum:5000 name:L2D_CACHE_ALLOCATE : Level 2 data cache allocation without refill +diff --git a/events/arm/armv8-pmuv3-common/unit_masks b/events/arm/armv8-pmuv3-common/unit_masks +new file mode 100644 +index 0000000..7666c35 +--- /dev/null ++++ b/events/arm/armv8-pmuv3-common/unit_masks +@@ -0,0 +1,4 @@ ++# ARMv8 architected events unit masks ++# ++name:zero type:mandatory default:0x00 ++ 0x00 No unit mask +diff --git a/events/arm/armv8-xgene/events b/events/arm/armv8-xgene/events +new file mode 100644 +index 0000000..3e28463 +--- /dev/null ++++ b/events/arm/armv8-xgene/events +@@ -0,0 +1,7 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# Basic ARM V8 events ++# ++include:arm/armv8-pmuv3-common +diff --git a/events/arm/armv8-xgene/unit_masks b/events/arm/armv8-xgene/unit_masks +new file mode 100644 +index 0000000..9ace2eb +--- /dev/null ++++ b/events/arm/armv8-xgene/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 architected events unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 1ae2913..0cfb4ea 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -129,6 +129,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, ++ { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -395,6 +396,11 @@ static op_cpu _get_arm_cpu_type(void) + case 0xc0f: + return op_get_cpu_number("arm/armv7-ca15"); + } ++ } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ ++ switch (cpuid) { ++ case 0x000: ++ return op_get_cpu_number("arm/armv8-xgene"); ++ } + } else if (vendorid == 0x69) { /* Intel xscale */ + switch (cpuid >> 9) { + case 1: +@@ -631,7 +637,8 @@ static op_cpu __get_cpu_type_alt_method(void) + if (strncmp(uname_info.machine, "ppc64", 5) == 0) { + return _get_ppc64_cpu_type(); + } +- if (strncmp(uname_info.machine, "arm", 3) == 0) { ++ if (strncmp(uname_info.machine, "arm", 3) == 0 || ++ strncmp(uname_info.machine, "aarch64", 7) == 0) { + return _get_arm_cpu_type(); + } + if (strncmp(uname_info.machine, "tile", 4) == 0) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 67e16de..7c478ad 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -109,6 +109,7 @@ typedef enum { + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ + CPU_SILVERMONT, /** 
< Intel Silvermont microarchitecture */ + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ ++ CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 358a154..e0d3ed5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,6 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: ++ case CPU_ARM_V8_XGENE: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/opcontrol b/utils/opcontrol +index 38bb1ac..04a4a91 100755 +--- a/utils/opcontrol ++++ b/utils/opcontrol +@@ -400,6 +400,11 @@ do_init() + do_deinit + exit 1 + ;; ++ aarch64/*) ++ echo "*** ARM AArch64 processors are not supported with opcontrol. Please use operf instead. ***" ++ do_deinit ++ exit 1 ++ ;; + esac + fi + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index af4c1e5..35f47bc 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -656,6 +656,13 @@ int main(int argc, char const * argv[]) + "Cortex A15 DDI (ARM DDI 0438F, revision r3p1)\n"; + break; + ++ case CPU_ARM_V8_APM_XGENE: ++ event_doc = ++ "See ARM Architecture Reference Manual \n" ++ "ARMv8, for ARMv8-A architecture profile\n" ++ "DDI (ARM DDI0487A.a)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" + + +commit a5eec42a9324915947e78634ddcce55b159a5dd2 +Author: Maynard Johnson +Date: Wed Feb 12 08:29:15 2014 -0600 + + Minor fixup for previous commit + + The previous commit for the new APM X-Gene (AaArch64 ARMv8) + processor went through a number of iterations before acceptance. + I missed changing one of the references to the new CPU type + from CPU_ARM_V8_XGENE to CPU_ARM_V8_APM_XGENE when I committed it. + This patch fixes that. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_events.c b/libop/op_events.c +index e0d3ed5..77fc8a5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,7 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: +- case CPU_ARM_V8_XGENE: ++ case CPU_ARM_V8_APM_XGENE + descr->name = "CPU_CYCLES"; + break; + +commit c4e390042458aee07016da0cab251b0ad67b8d2b +Author: William Cohen +Date: Wed Feb 12 11:56:39 2014 -0500 + + Add missing ':' on case statement for CPU_ARM_V8_APM_XGENE + +diff --git a/libop/op_events.c b/libop/op_events.c +index 77fc8a5..968ff04 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,7 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: +- case CPU_ARM_V8_APM_XGENE ++ case CPU_ARM_V8_APM_XGENE: + descr->name = "CPU_CYCLES"; + break; + +From 40adac210cf9ac8d79a90609c91b8ee5e05b8a2f Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Mon, 21 Jul 2014 14:36:23 -0400 +Subject: [PATCH 1/2] Add oprofile support for ARM Cortex A57 microarchitecture + +This patch adds the event list of the ARM Cortex A57 architecture. + +The patch is very straight forward: just add the model numbers and +type in the usual places and add the event list. 
+ +Passes make check + +Signed-off-by: William Cohen +--- + events/Makefile.am | 1 + + events/arm/armv8-ca57/events | 67 ++++++++++++++++++++++++++++++++++++++++ + events/arm/armv8-ca57/unit_masks | 3 ++ + libop/op_cpu_type.c | 3 ++ + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + utils/ophelp.c | 6 ++++ + 7 files changed, 82 insertions(+) + create mode 100644 events/arm/armv8-ca57/events + create mode 100644 events/arm/armv8-ca57/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index f6fd3d7..b4bca1e 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -62,6 +62,7 @@ event_files = \ + arm/mpcore/events arm/mpcore/unit_masks \ + arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ + arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ ++ arm/armv8-ca57/events arm/armv8-ca57/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-ca57/events b/events/arm/armv8-ca57/events +new file mode 100644 +index 0000000..62974c1 +--- /dev/null ++++ b/events/arm/armv8-ca57/events +@@ -0,0 +1,67 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARM Cortex A57 events ++# From Cortex A57 TRM ++# ++include:arm/armv8-pmuv3-common ++event:0x40 um:zero minimum:10007 name:L1D_CACHE_LD : Level 1 data cache access - Read ++event:0x41 um:zero minimum:10007 name:L1D_CACHE_ST : Level 1 data cache access - Write ++event:0x42 um:zero minimum:10007 name:L1D_CACHE_REFILL_LD : Level 1 data cache refill - Read ++event:0x43 um:zero minimum:10007 name:L1D_CACHE_REFILL_ST : Level 1 data cache refill - Write ++event:0x46 um:zero minimum:10007 name:L1D_CACHE_WB_VICTIM : Level 1 data cache Write-back - Victim ++event:0x47 um:zero minimum:10007 name:L1D_CACHE_WB_CLEAN : Level 1 data cache Write-back - Cleaning and coherency ++event:0x48 um:zero minimum:10007 name:L1D_CACHE_INVAL : Level 1 data cache invalidate ++event:0x4C um:zero minimum:10007 name:L1D_TLB_REFILL_LD : Level 1 data TLB refill - Read ++event:0x4D um:zero minimum:10007 name:L1D_TLB_REFILL_ST : Level 1 data TLB refill - Write ++event:0x50 um:zero minimum:10007 name:L2D_CACHE_LD : Level 2 data cache access - Read ++event:0x51 um:zero minimum:10007 name:L2D_CACHE_ST : Level 2 data cache access - Write ++event:0x52 um:zero minimum:10007 name:L2D_CACHE_REFILL_LD : Level 2 data cache refill - Read ++event:0x53 um:zero minimum:10007 name:L2D_CACHE_REFILL_ST : Level 2 data cache refill - Write ++event:0x56 um:zero minimum:10007 name:L2D_CACHE_WB_VICTIM : Level 2 data cache Write-back - Victim ++event:0x57 um:zero minimum:10007 name:L2D_CACHE_WB_CLEAN : Level 2 data cache Write-back - Cleaning and coherency ++event:0x58 um:zero minimum:10007 name:L2D_CACHE_INVAL : Level 2 data cache invalidate ++event:0x60 um:zero minimum:10007 name:BUS_ACCESS_LD : Bus access - Read ++event:0x61 um:zero minimum:10007 name:BUS_ACCESS_ST : Bus access - Write ++event:0x62 um:zero minimum:10007 name:BUS_ACCESS_SHARED : Bus access - Normal ++event:0x63 um:zero minimum:10007 name:BUS_ACCESS_NOT_SHARED : Bus access - Not normal ++event:0x64 um:zero 
minimum:10007 name:BUS_ACCESS_NORMAL : Bus access - Normal ++event:0x65 um:zero minimum:10007 name:BUS_ACCESS_PERIPH : Bus access - Peripheral ++event:0x66 um:zero minimum:10007 name:MEM_ACCESS_LD : Data memory access - Read ++event:0x67 um:zero minimum:10007 name:MEM_ACCESS_ST : Data memory access - Write ++event:0x68 um:zero minimum:10007 name:UNALIGNED_LD_SPEC : Unaligned access - Read ++event:0x69 um:zero minimum:10007 name:UNALIGNED_ST_SPEC : Unaligned access - Write ++event:0x6A um:zero minimum:10007 name:UNALIGNED_LDST_SPEC : Unaligned access ++event:0x6C um:zero minimum:10007 name:LDREX_SPEC : Exclusive operation speculatively executed - LDREX ++event:0x6D um:zero minimum:10007 name:STREX_PASS_SPEC : Exclusive instruction speculatively executed - STREX pass ++event:0x6E um:zero minimum:10007 name:STREX_FAIL_SPEC : Exclusive operation speculatively executed - STREX fail ++event:0x70 um:zero minimum:10007 name:LD_SPEC : Operation speculatively executed - Load ++event:0x71 um:zero minimum:10007 name:ST_SPEC : Operation speculatively executed - Store ++event:0x72 um:zero minimum:10007 name:LDST_SPEC : Operation speculatively executed - Load or store ++event:0x73 um:zero minimum:10007 name:DP_SPEC : Operation speculatively executed - Integer data processing ++event:0x74 um:zero minimum:10007 name:ASE_SPEC : Operation speculatively executed - Advanced SIMD ++event:0x75 um:zero minimum:10007 name:VFP_SPEC : Operation speculatively executed - VFP ++event:0x76 um:zero minimum:10007 name:PC_WRITE_SPEC : Operation speculatively executed - Software change of the PC ++event:0x77 um:zero minimum:10007 name:CRYPTO_SPEC : Operation speculatively executed, crypto data processing ++event:0x78 um:zero minimum:10007 name:BR_IMMED_SPEC : Branch speculatively executed - Immediate branch ++event:0x79 um:zero minimum:10007 name:BR_RETURN_SPEC : Branch speculatively executed - Procedure return ++event:0x7A um:zero minimum:10007 name:BR_INDIRECT_SPEC : Branch speculatively executed 
- Indirect branch ++event:0x7C um:zero minimum:10007 name:ISB_SPEC : Barrier speculatively executed - ISB ++event:0x7D um:zero minimum:10007 name:DSB_SPEC : Barrier speculatively executed - DSB ++event:0x7E um:zero minimum:10007 name:DMB_SPEC : Barrier speculatively executed - DMB ++event:0x81 um:zero minimum:10007 name:EXC_UNDEF : Exception taken, other synchronous ++event:0x82 um:zero minimum:10007 name:EXC_SVC : Exception taken, Supervisor Call ++event:0x83 um:zero minimum:10007 name:EXC_PABORT : Exception taken, Instruction Abort ++event:0x84 um:zero minimum:10007 name:EXC_DABORT : Exception taken, Data Abort or SError ++event:0x86 um:zero minimum:10007 name:EXC_IRQ : Exception taken, IRQ ++event:0x87 um:zero minimum:10007 name:EXC_FIQ : Exception taken, FIQ ++event:0x88 um:zero minimum:10007 name:EXC_SMC : Exception taken, Secure Monitor Call ++event:0x8A um:zero minimum:10007 name:EXC_HVC : Exception taken, Hypervisor Call ++event:0x8B um:zero minimum:10007 name:EXC_TRAP_PABORT : Exception taken, Instruction Abort not taken locally ++event:0x8C um:zero minimum:10007 name:EXC_TRAP_DABORT : Exception taken, Data Abort, or SError not taken locally ++event:0x8D um:zero minimum:10007 name:EXC_TRAP_OTHER : Exception taken – Other traps not taken locally ++event:0x8E um:zero minimum:10007 name:EXC_TRAP_IRQ : Exception taken, IRQ not taken locally ++event:0x8F um:zero minimum:10007 name:EXC_TRAP_FIQ : Exception taken, FIQ not taken locally ++event:0x90 um:zero minimum:10007 name:RC_LD_SPEC : Release consistency instruction speculatively executed – Load-Acquire ++event:0x91 um:zero minimum:10007 name:RC_ST_SPEC : Release consistency instruction speculatively executed – Store-Release +diff --git a/events/arm/armv8-ca57/unit_masks b/events/arm/armv8-ca57/unit_masks +new file mode 100644 +index 0000000..5d69263 +--- /dev/null ++++ b/events/arm/armv8-ca57/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 Cortex A57 unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git 
a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index bce230a..163bd1c 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -131,6 +131,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, ++ { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -396,6 +397,8 @@ static op_cpu _get_arm_cpu_type(void) + return op_get_cpu_number("arm/armv7-ca9"); + case 0xc0f: + return op_get_cpu_number("arm/armv7-ca15"); ++ case 0xd07: ++ return op_get_cpu_number("arm/armv8-ca57"); + } + } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ + switch (cpuid) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 3754156..aebd7f6 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -111,6 +111,7 @@ typedef enum { + CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ ++ CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index b8900a5..d5249b7 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1255,6 +1255,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: + case CPU_ARM_V8_APM_XGENE: ++ case CPU_ARM_V8_CA57: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index bf3fbcb..a5edf56 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -664,6 +664,12 @@ int main(int argc, char const * argv[]) + "DDI (ARM DDI0487A.a)\n"; + break; + ++ case CPU_ARM_V8_CA57: ++ event_doc = ++ "See Cortex-A57 MPCore 
Technical Reference Manual\n" ++ "Cortex A57 DDI (ARM DDI 0488D, revision r1p1)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" +-- +1.9.3 + +From 78db0d3eb65e6005931b0402484e759c35df79f1 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Wed, 23 Jul 2014 23:25:21 -0400 +Subject: [PATCH] Add oprofile support for ARM Cortex A53 microarchitecture + +This patch adds the event list of the ARM Cortex A53 architecture. + +The patch is very straight forward: just add the model numbers and +type in the usual places and add the event list. + +Passes make check + +Signed-off-by: William Cohen +--- + events/Makefile.am | 1 + + events/arm/armv8-ca53/events | 38 ++++++++++++++++++++++++++++++++++++++ + events/arm/armv8-ca53/unit_masks | 3 +++ + libop/op_cpu_type.c | 3 +++ + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + utils/ophelp.c | 6 ++++++ + 7 files changed, 53 insertions(+) + create mode 100644 events/arm/armv8-ca53/events + create mode 100644 events/arm/armv8-ca53/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index b4bca1e..67be125 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -63,6 +63,7 @@ event_files = \ + arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ + arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ + arm/armv8-ca57/events arm/armv8-ca57/unit_masks \ ++ arm/armv8-ca53/events arm/armv8-ca53/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-ca53/events b/events/arm/armv8-ca53/events +new file mode 100644 +index 0000000..5e1b4d8 +--- /dev/null ++++ b/events/arm/armv8-ca53/events +@@ -0,0 +1,38 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARM Cortex A53 events ++# From Cortex A53 TRM ++# ++include:arm/armv8-pmuv3-common ++event:0x60 um:zero minimum:10007 name:BUS_ACCESS_LD : Bus access - Read ++event:0x61 um:zero minimum:10007 name:BUS_ACCESS_ST : Bus access - Write ++event:0x7A um:zero minimum:10007 name:BR_INDIRECT_SPEC : Branch speculatively executed - Indirect branch ++event:0x86 um:zero minimum:10007 name:EXC_IRQ : Exception taken, IRQ ++event:0x87 um:zero minimum:10007 name:EXC_FIQ : Exception taken, FIQ ++event:0xC0 um:zero minimum:10007 name:EXT_MEM_REQ : External memory request ++event:0xC1 um:zero minimum:10007 name:EXT_MEM_REQ_NC : Non-cacheable external memory request ++event:0xC2 um:zero minimum:10007 name:PREFETCH_LINEFILL : Linefill because of prefetch ++event:0xC3 um:zero minimum:10007 name:PREFETCH_LINEFILL_DROP : Instruction Cache Throttle occurred ++event:0xC4 um:zero minimum:10007 name:READ_ALLOC_ENTER : Entering read allocate mode ++event:0xC5 um:zero minimum:10007 name:READ_ALLOC : Read allocate mode ++event:0xC6 um:zero minimum:10007 name:PRE_DECODE_ERR : Pre-decode error ++event:0xC7 um:zero minimum:10007 name:STALL_SB_FULL : Data Write operation that stalls the pipeline because the store buffer is full ++event:0xC8 um:zero minimum:10007 name:EXT_SNOOP : SCU Snooped data from another CPU for this CPU ++event:0xC9 um:zero minimum:10007 name:BR_COND : Conditional branch executed ++event:0xCA um:zero minimum:10007 name:BR_INDIRECT_MISPRED : Indirect branch mispredicted ++event:0xCB um:zero minimum:10007 name:BR_INDIRECT_MISPRED_ADDR : Indirect branch mispredicted because of address miscompare ++event:0xCC um:zero minimum:10007 name:BR_COND_MISPRED : Conditional branch mispredicted ++event:0xD0 um:zero minimum:10007 name:L1I_CACHE_ERR : L1 Instruction Cache (data or tag) memory error ++event:0xD1 um:zero minimum:10007 name:L1D_CACHE_ERR : L1 Data Cache (data, tag or dirty) memory error, correctable or non-correctable ++event:0xD2 um:zero 
minimum:10007 name:TLB_ERR : TLB memory error ++event:0xE0 um:zero minimum:10007 name:OTHER_IQ_DEP_STALL : Cycles that the DPU IQ is empty and that is not because of a recent micro-TLB miss, instruction cache miss or pre-decode error ++event:0xE1 um:zero minimum:10007 name:IC_DEP_STALL : Cycles the DPU IQ is empty and there is an instruction cache miss being processed ++event:0xE2 um:zero minimum:10007 name:IUTLB_DEP_STALL : Cycles the DPU IQ is empty and there is an instruction micro-TLB miss being processed ++event:0xE3 um:zero minimum:10007 name:DECODE_DEP_STALL : Cycles the DPU IQ is empty and there is a pre-decode error being processed ++event:0xE4 um:zero minimum:10007 name:OTHER_INTERLOCK_STALL : Cycles there is an interlock other than Advanced SIMD/Floating-point instructions or load/store instruction ++event:0xE5 um:zero minimum:10007 name:AGU_DEP_STALL : Cycles there is an interlock for a load/store instruction waiting for data to calculate the address in the AGU ++event:0xE6 um:zero minimum:10007 name:SIMD_DEP_STALL : Cycles there is an interlock for an Advanced SIMD/Floating-point operation. 
++event:0xE7 um:zero minimum:10007 name:LD_DEP_STALL : Cycles there is a stall in the Wr stage because of a load miss ++event:0xE8 um:zero minimum:10007 name:ST_DEP_STALL : Cycles there is a stall in the Wr stage because of a store +diff --git a/events/arm/armv8-ca53/unit_masks b/events/arm/armv8-ca53/unit_masks +new file mode 100644 +index 0000000..42b12b4 +--- /dev/null ++++ b/events/arm/armv8-ca53/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 Cortex A53 unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 163bd1c..055c64b 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -132,6 +132,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, + { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, ++ { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -399,6 +400,8 @@ static op_cpu _get_arm_cpu_type(void) + return op_get_cpu_number("arm/armv7-ca15"); + case 0xd07: + return op_get_cpu_number("arm/armv8-ca57"); ++ case 0xd03: ++ return op_get_cpu_number("arm/armv8-ca53"); + } + } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ + switch (cpuid) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index aebd7f6..a6bb323 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -112,6 +112,7 @@ typedef enum { + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ ++ CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index d5249b7..bbeb212 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1256,6 +1256,7 @@ void op_default_event(op_cpu cpu_type, 
struct op_default_event_descr * descr) + case CPU_ARM_SCORPIONMP: + case CPU_ARM_V8_APM_XGENE: + case CPU_ARM_V8_CA57: ++ case CPU_ARM_V8_CA53: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index a5edf56..980c6dc 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -670,6 +670,12 @@ int main(int argc, char const * argv[]) + "Cortex A57 DDI (ARM DDI 0488D, revision r1p1)\n"; + break; + ++ case CPU_ARM_V8_CA53: ++ event_doc = ++ "See Cortex-A53 MPCore Technical Reference Manual\n" ++ "Cortex A57 DDI (ARM DDI 0500D, revision r0p2)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" +-- +1.9.3 + +From 76464b279cf20bb0bb40e758afb32eaf4195d861 Mon Sep 17 00:00:00 2001 +From: Maynard Johnson +Date: Fri, 1 Aug 2014 09:06:17 -0500 +Subject: [PATCH 1/2] Add another ARM internal mapping symbol to ignore + +Ignore "$x" symbols, which can show up as internal +mapping symbols in binaries built on Aarch64. 
+ +Reported-by: Andrew Haley +Signed-off-by: Maynard Johnson +--- + libutil++/bfd_support.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index a3bee99..0554616 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -475,7 +475,8 @@ bool interesting_symbol(asymbol * sym) + /* ARM assembler internal mapping symbols aren't interesting */ + if ((strcmp("$a", sym->name) == 0) || + (strcmp("$t", sym->name) == 0) || +- (strcmp("$d", sym->name) == 0)) ++ (strcmp("$d", sym->name) == 0))|| ++ (strcmp("$x", sym->name) == 0)) + return false; + + // C++ exception stuff +-- +1.9.3 + +From a4bdbc9ce94b15df3d19d60a11e4c4f2fc729cd9 Mon Sep 17 00:00:00 2001 +From: Maynard Johnson +Date: Fri, 1 Aug 2014 09:25:55 -0500 +Subject: [PATCH 2/2] Fix mis-placed parentheses in previous commit that caused + build error + +Signed-off-by: Maynard Johnson +--- + libutil++/bfd_support.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index 0554616..d5fd70d 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -475,7 +475,7 @@ bool interesting_symbol(asymbol * sym) + /* ARM assembler internal mapping symbols aren't interesting */ + if ((strcmp("$a", sym->name) == 0) || + (strcmp("$t", sym->name) == 0) || +- (strcmp("$d", sym->name) == 0))|| ++ (strcmp("$d", sym->name) == 0) || + (strcmp("$x", sym->name) == 0)) + return false; + +-- +1.9.3 + diff --git a/SOURCES/oprofile-broadwell.patch b/SOURCES/oprofile-broadwell.patch new file mode 100644 index 0000000..a08001d --- /dev/null +++ b/SOURCES/oprofile-broadwell.patch @@ -0,0 +1,1040 @@ +commit 6d692179cb44e68a3cfaeac213e3244f858676b8 +Author: Andi Kleen +Date: Wed Jul 16 08:03:54 2014 -0500 + + Add oprofile support for Broadwell microarchitecture + + This patch adds the event list of the Intel Broadwell architecture. 
+ Hopefully this can still make 1.0 + + The patch is very straight forward: just add the model numbers and + type in the usual places and add the event list. + + Passes make check + + Some notes: + - Haswell included one Broadwell model number by mistake. I moved + that to Broadwell now. + - oprofile doesn't support umask sub events with different counter + constraints than other events. This affects a few events on Broadwell. + However it's not a problem when oprofile uses perf as a backend, + as perf will know how to schedule these events (once it gets the + Broadwell support). It won't work correctly with the old driver. + Most of these events are not too useful for sampling, so in practice + it's not a real problem. + - As usual PEBS events and events with offcore mask and uncore + events are missing. + + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index 3e43d10..f6fd3d7 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -21,6 +21,7 @@ event_files = \ + i386/sandybridge/events i386/sandybridge/unit_masks \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ ++ i386/broadwell/events i386/broadwell/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ +diff --git a/events/i386/broadwell/events b/events/i386/broadwell/events +new file mode 100644 +index 0000000..6a4b388 +--- /dev/null ++++ b/events/i386/broadwell/events +@@ -0,0 +1,65 @@ ++# ++# Intel "Broadwell" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Broadwell based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++include:i386/arch_perfmon ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : ++event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x14 counters:cpuid um:one minimum:2000003 name:arith_fpu_div_active : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : ++event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:x02 minimum:100003 name:load_hit_pre_hw_pf : ++event:0x4f counters:cpuid um:x10 minimum:2000003 name:ept_walk_cycles : ++event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : ++event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : ++event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:x02 minimum:200003 name:icache_misses : ++event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:one minimum:2000003 name:ild_stall_lcp 
: ++event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : ++event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : ++event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : ++event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000003 name:uops_dispatched_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa8 counters:cpuid um:lsd minimum:2000003 name:lsd : ++event:0xab counters:cpuid um:x02 minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : ++event:0xc0 counters:1 um:inst_retired minimum:2000003 name:inst_retired : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:2000003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:0,1,2,3 um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : ++event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : ++event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired 
: ++event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : ++event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : ++event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : ++event:0xf2 counters:cpuid um:x05 minimum:100003 name:l2_lines_out_demand_clean : +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +new file mode 100644 +index 0000000..470e9e9 +--- /dev/null ++++ b/events/i386/broadwell/unit_masks +@@ -0,0 +1,316 @@ ++# ++# Unit masks for the Intel "Broadwell" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Broadwell based CPUs ++# ++include:i386/arch_perfmon ++name:x02 type:mandatory default:0x2 ++ 0x2 No unit mask ++name:x03 type:mandatory default:0x3 ++ 0x3 No unit mask ++name:x05 type:mandatory default:0x5 ++ 0x5 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask ++name:x1f type:mandatory default:0x1f ++ 0x1f No unit mask ++name:x20 type:mandatory default:0x20 ++ 0x20 No unit mask ++name:x50 type:mandatory default:0x50 ++ 0x50 No unit mask ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward This event counts how many times the load operation got the true Block-on-Store blocking code preventing store forwarding. This includes cases when: - preceding store conflicts with the load (incomplete overlap); - store forwarding is impossible due to u-arch limitations; - preceding lock RMW operations are not forwarded; - store has the no-forward bit set (uncacheable/page-split/masked stores); - all-blocking stores are used (mostly, fences and port I/O); and others. 
The most common case is a load blocked due to its address range overlapping with a preceding smaller uncompleted store. Note: This event does not take into account cases of out-of-SW-control (for example, SbTailHit), unknown physical STA, and cases of blocking loads on store due to being non-WB memory type or a lock. These cases are covered by other events. See the table of not supported store forwards in the Optimization Guide. ++ 0x8 extra: no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. ++name:misalign_mem_ref type:exclusive default:0x1 ++ 0x1 extra: loads This event counts speculative cache-line split load uops dispatched to the L1 cache. ++ 0x2 extra: stores This event counts speculative cache line split store-address (STA) uops dispatched to the L1 cache. ++name:dtlb_load_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts load misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts load misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. ++ 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. ++ 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks ++name:uops_issued type:exclusive default:0x1 ++ 0x1 extra: any This event counts the number of Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS). ++ 0x10 extra: flags_merge Number of flags-merge uops being allocated. 
Such uops considered perf sensitive; added by GSR u-arch. ++ 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. ++ 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated ++ 0x1 extra:inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. ++name:l2_rqsts type:exclusive default:0x21 ++ 0x21 extra: demand_data_rd_miss This event counts the number of demand Data Read requests that miss L2 cache. Only not rejected loads are counted. ++ 0x41 extra: demand_data_rd_hit This event counts the number of demand Data Read requests that hit L2 cache. Only not rejected loads are counted. ++ 0x30 extra: l2_pf_miss This event counts the number of requests from the L2 hardware prefetchers that miss L2 cache. ++ 0x50 extra: l2_pf_hit This event counts the number of requests from the L2 hardware prefetchers that hit L2 cache. L3 prefetch new types ++ 0xe1 extra: all_demand_data_rd This event counts the number of demand Data Read requests (including requests from L1D hardware prefetchers). These loads may hit or miss L2 cache. Only non rejected loads are counted. ++ 0xe2 extra: all_rfo This event counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. ++ 0xe4 extra: all_code_rd This event counts the total number of L2 code requests. ++ 0xf8 extra: all_pf This event counts the total number of requests from the L2 hardware prefetchers. ++ 0x42 extra: rfo_hit RFO requests that hit L2 cache ++ 0x22 extra: rfo_miss RFO requests that miss L2 cache ++ 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. 
++ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:l1d_pend_miss type:exclusive default:0x1 ++ 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. ++ 0x1 extra: pending_cycles This event counts duration of L1D miss outstanding in cycles. ++name:dtlb_store_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. 
++ 0x20 extra: stlb_hit_4k Store misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks ++name:tx_mem type:exclusive default:0x1 ++ 0x1 extra: abort_conflict Number of times a TSX line had a cache conflict ++ 0x2 extra: abort_capacity_write Number of times a TSX Abort was triggered due to an evicted line caused by a transaction overflow ++ 0x4 extra: abort_hle_store_to_elided_lock Number of times a TSX Abort was triggered due to a non-release/commit store to lock ++ 0x8 extra: abort_hle_elision_buffer_not_empty Number of times a TSX Abort was triggered due to commit but Lock Buffer not empty ++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times a TSX Abort was triggered due to release/commit but data and address mismatch ++ 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times a TSX Abort was triggered due to attempting an unsupported alignment from Lock Buffer ++ 0x40 extra: hle_elision_buffer_full Number of times we could not allocate Lock Buffer ++name:move_elimination type:exclusive default:0x1 ++ 0x1 extra: int_eliminated Number of integer Move Elimination candidate uops that were eliminated. ++ 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. ++ 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. ++ 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. ++name:cpl_cycles type:exclusive default:0x1 ++ 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. ++ 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. 
++ 0x1 extra:edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. ++name:tx_exec type:exclusive default:0x1 ++ 0x1 extra: misc1 Unfriendly TSX abort triggered by a flowmarker ++ 0x2 extra: misc2 Unfriendly TSX abort triggered by a vzeroupper instruction ++ 0x4 extra: misc3 Unfriendly TSX abort triggered by a nest count that is too deep ++ 0x8 extra: misc4 RTM region detected inside HLE ++ 0x10 extra: misc5 # HLE inside HLE+ ++name:rs_events type:exclusive default:0x1 ++ 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. ++ 0x1 extra:inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. ++ 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. ++ 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. 
A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x8 extra: all_data_rd This event counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x1 extra: cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). ++ 0x8 extra: cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++name:lock_cycles type:exclusive default:0x1 ++ 0x1 extra: split_lock_uc_lock_duration This event counts cycles in which the L1 and L2 are locked due to a UC lock or split lock. A lock is asserted in case of locked memory access, due to noncacheable memory, locked operation that spans two cache lines, or a page walk from the noncacheable page table. L1D and L2 locks have a very high performance penalty and it is highly recommended to avoid such access. ++ 0x2 extra: cache_lock_duration This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION). 
++name:idq type:exclusive default:0x2 ++ 0x2 extra: empty This counts the number of cycles that the instruction decoder queue is empty and can indicate that the application may be bound in the front end. It does not determine whether there are uops being delivered to the Alloc stage since uops can be delivered by bypass skipping the Instruction Decode Queue (IDQ) when it is empty. ++ 0x4 extra: mite_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x8 extra: dsb_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra: ms_dsb_uops This event counts the number of uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x20 extra: ms_mite_uops This event counts the number of uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x30 extra: ms_uops This event counts the total number of uops delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops may be initiated by Decode Stream Buffer (DSB) or MITE. ++ 0x30 extra: ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops may be initiated by Decode Stream Buffer (DSB) or MITE.
++ 0x4 extra: mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. ++ 0x8 extra: dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra: ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra:edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra: all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra: all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x24 extra: all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x24 extra: all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). 
++ 0x3c extra: mite_all_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x30 extra:edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++name:itlb_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. ++ 0x20 extra: stlb_hit_4k Core misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks ++name:br_inst_exec type:exclusive default:0xff ++ 0xff extra: all_branches This event counts both taken and not taken speculative and retired branch instructions. ++ 0x41 extra: nontaken_conditional This event counts not taken macro-conditional branch instructions. ++ 0x81 extra: taken_conditional This event counts taken speculative and retired macro-conditional branch instructions. ++ 0x82 extra: taken_direct_jump This event counts taken speculative and retired macro-conditional branch instructions excluding calls and indirect branches. ++ 0x84 extra: taken_indirect_jump_non_call_ret This event counts taken speculative and retired indirect branches excluding calls and return branches. ++ 0x88 extra: taken_indirect_near_return This event counts taken speculative and retired indirect branches that have a return mnemonic. 
++ 0x90 extra: taken_direct_near_call This event counts taken speculative and retired direct near calls. ++ 0xa0 extra: taken_indirect_near_call This event counts taken speculative and retired indirect calls including both register and memory indirect. ++ 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired macro-conditional branch instructions. ++ 0xc2 extra: all_direct_jmp This event counts both taken and not taken speculative and retired macro-unconditional branch instructions, excluding calls and indirects. ++ 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken speculative and retired indirect branches excluding calls and return branches. ++ 0xc8 extra: all_indirect_near_return This event counts both taken and not taken speculative and retired indirect branches that have a return mnemonic. ++ 0xd0 extra: all_direct_near_call This event counts both taken and not taken speculative and retired direct near calls. ++name:br_misp_exec type:exclusive default:0xff ++ 0xff extra: all_branches This event counts both taken and not taken speculative and retired mispredicted branch instructions. ++ 0x41 extra: nontaken_conditional This event counts not taken speculative and retired mispredicted macro conditional branch instructions. ++ 0x81 extra: taken_conditional This event counts taken speculative and retired mispredicted macro conditional branch instructions. ++ 0x84 extra: taken_indirect_jump_non_call_ret This event counts taken speculative and retired mispredicted indirect branches excluding calls and returns. ++ 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired mispredicted macro conditional branch instructions. ++ 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken mispredicted indirect branches excluding calls and returns. 
++ 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls ++name:idq_uops_not_delivered type:exclusive default:0x1 ++ 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding "4 - x" when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops. ++ 0x1 extra: cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. ++ 0x1 extra: cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. ++ 0x1 extra: cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end ++ 0x1 extra: cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end ++ 0x1 extra:inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
++name:uops_executed_port type:exclusive default:0x1 ++ 0x1 extra:any port_0_core Cycles per core when uops are executed in port 0 ++ 0x2 extra:any port_1_core Cycles per core when uops are executed in port 1 ++ 0x4 extra:any port_2_core Cycles per core when uops are dispatched to port 2 ++ 0x8 extra:any port_3_core Cycles per core when uops are dispatched to port 3 ++ 0x10 extra:any port_4_core Cycles per core when uops are executed in port 4 ++ 0x20 extra:any port_5_core Cycles per core when uops are executed in port 5 ++ 0x40 extra:any port_6_core Cycles per core when uops are executed in port 6 ++ 0x80 extra:any port_7_core Cycles per core when uops are dispatched to port 7 ++ 0x1 extra: port_0 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 0. ++ 0x2 extra: port_1 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 1. ++ 0x4 extra: port_2 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 2. ++ 0x8 extra: port_3 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 3. ++ 0x10 extra: port_4 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 4. ++ 0x20 extra: port_5 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 5. ++ 0x40 extra: port_6 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 6. ++ 0x80 extra: port_7 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 7.
++name:uops_dispatched_port type:exclusive default:0x1 ++ 0x1 extra: port_0 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 0. ++ 0x2 extra: port_1 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 1. ++ 0x4 extra: port_2 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 2. ++ 0x8 extra: port_3 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 3. ++ 0x10 extra: port_4 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 4. ++ 0x20 extra: port_5 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 5. ++ 0x40 extra: port_6 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 6. ++ 0x80 extra: port_7 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 7. ++name:resource_stalls type:exclusive default:0x1 ++ 0x1 extra: any This event counts resource-related stall cycles. Reasons for stalls can be as follows: - *any* u-arch structure got full (LB, SB, RS, ROB, BOB, LM, Physical Register Reclaim Table (PRRT), or Physical History Table (PHT) slots) - *any* u-arch structure got empty (like INT/SIMD FreeLists) - FPU control word (FPCW), MXCSR and others. This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x4 extra: rs This event counts stall cycles caused by absence of eligible entries in the reservation station (RS). 
This may result from RS overflow, or from RS deallocation because of the RS array Write Port allocation scheme (each RS entry has two write ports instead of four. As a result, empty entries could not be used, although RS is not really full). This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. ++name:cycle_activity type:exclusive default:0x1 ++ 0x1 extra: cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. ++ 0x8 extra: cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. ++ 0x2 extra: cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) ++ 0x4 extra: cycles_no_execute Counts number of cycles nothing is executed on any execution port. ++ 0x5 extra: stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands ++ 0x6 extra: stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. ++ 0xc extra: stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. ++ 0x8 extra: cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. 
++ 0x1 extra: cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x2 extra: cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x4 extra: stalls_total Total execution stalls. ++ 0xc extra: stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x5 extra: stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x6 extra: stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++name:lsd type:exclusive default:0x1 ++ 0x1 extra: uops Number of Uops delivered by the LSD. Read more on LSD under LSD_REPLAY.REPLAY ++ 0x1 extra: cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra: cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++name:offcore_requests type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd This event counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. ++ 0x2 extra: demand_code_rd This event counts both cacheable and noncacheable code read requests. ++ 0x4 extra: demand_rfo This event counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. ++ 0x8 extra: all_data_rd This event counts the demand and prefetch data reads. All Core Data Reads include cacheable "Demands" and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. ++name:uops_executed type:exclusive default:0x1 ++ 0x1 extra: thread Number of uops to be executed per-thread each cycle. ++ 0x2 extra: core Number of uops executed from any thread ++ 0x1 extra:inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.
++ 0x1 extra: cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra: cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra: cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra: cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++name:page_walker_loads type:exclusive default:0x11 ++ 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB ++ 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB ++ 0x12 extra: dtlb_l2 Number of DTLB page walker hits in the L2 ++ 0x22 extra: itlb_l2 Number of ITLB page walker hits in the L2 ++ 0x14 extra: dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP ++ 0x24 extra: itlb_l3 Number of ITLB page walker hits in the L3 + XSNP ++ 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory ++name:inst_retired type:exclusive default:0x2 ++ 0x2 extra: x87 This is a non-precise version (that is, does not use PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling. ++ 0x1 extra: prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. ++name:other_assists type:exclusive default:0x8 ++ 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. ++ 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. ++ 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. 
++name:uops_retired type:exclusive default:0x1 ++ 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. ++ 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. ++ 0x1 extra:inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. ++ 0x1 extra:inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. ++ 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. ++ 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. ++ 0x20 extra: maskmov Maskmov false fault - counts number of time ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. ++ 0x1 extra:edge count Number of machine clears (nukes) of any type. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. ++ 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. 
++ 0x8 extra: near_return This is a non-precise version (that is, does not use PEBS) of the event that counts return instructions retired. ++ 0x10 extra: not_taken This is a non-precise version (that is, does not use PEBS) of the event that counts not taken branch instructions retired. ++ 0x20 extra: near_taken This is a non-precise version (that is, does not use PEBS) of the event that counts taken branch instructions retired. ++ 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. ++ 0x4 extra: all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. ++ 0x4 extra: all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. 
++name:hle_retired type:exclusive default:0x1 ++ 0x1 extra: start Number of times we entered an HLE region; does not count nested transactions ++ 0x2 extra: commit Number of times HLE commit succeeded ++ 0x4 extra: aborted Number of times HLE abort was triggered ++ 0x8 extra: aborted_misc1 Number of times an HLE abort was attributed to a Memory condition (See TSX_Memory event for additional details) ++ 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an HLE abort ++ 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an HLE abort ++ 0x40 extra: aborted_misc4 Number of times HLE caused a fault ++ 0x80 extra: aborted_misc5 Number of times HLE aborted and was not due to the abort conditions in subevents 3-6 ++name:rtm_retired type:exclusive default:0x1 ++ 0x1 extra: start Number of times we entered an RTM region; does not count nested transactions ++ 0x2 extra: commit Number of times RTM commit succeeded ++ 0x4 extra: aborted Number of times RTM abort was triggered ++ 0x8 extra: aborted_misc1 Number of times an RTM abort was attributed to a Memory condition (See TSX_Memory event for additional details) ++ 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an RTM abort ++ 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an RTM abort ++ 0x40 extra: aborted_misc4 Number of times a RTM caused a fault ++ 0x80 extra: aborted_misc5 Number of times RTM aborted and was not due to the abort conditions in subevents 3-6 ++name:fp_assist type:exclusive default:0x1e ++ 0x1e extra: any This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1. ++ 0x2 extra: x87_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid. 
++ 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid. ++ 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. ++ 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. ++name:mem_uops_retired type:exclusive default:0x11 ++ 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x21 extra: lock_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with locked access retired to the architected path. ++ 0x41 extra: split_loads This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted load uops retired to the architected path. 
A line split is across 64B cache-line which includes a page split (4K). ++ 0x42 extra: split_stores This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x81 extra: all_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. ++ 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. ++name:mem_load_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source ++ 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. ++ 0x4 extra: l3_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. 
++ 0x8 extra: l1_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. ++ 0x10 extra: l2_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. ++ 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. ++ 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. ++name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++ 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. ++ 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. ++ 0x4 extra: xsnp_hitm This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). ++ 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. 
++name:l2_trans type:exclusive default:0x80 ++ 0x80 extra: all_requests This event counts transactions that access the L2 pipe including snoops, pagewalks, and so on. ++ 0x1 extra: demand_data_rd This event counts Demand Data Read requests that access L2 cache, including rejects. ++ 0x2 extra: rfo This event counts Read for Ownership (RFO) requests that access L2 cache. ++ 0x4 extra: code_rd This event counts the number of L2 cache accesses when fetching instructions. ++ 0x8 extra: all_pf This event counts L2 or L3 HW prefetches that access L2 cache including rejects. ++ 0x10 extra: l1d_wb This event counts L1D writebacks that access L2 cache. ++ 0x20 extra: l2_fill This event counts L2 fill requests that access L2 cache. ++ 0x40 extra: l2_wb This event counts L2 writebacks that access L2 cache. ++name:l2_lines_in type:exclusive default:0x7 ++ 0x7 extra: all This event counts the number of L2 cache lines filling the L2. Counting does not cover rejects. ++ 0x1 extra: i This event counts the number of L2 cache lines in the Invalidate state filling the L2. Counting does not cover rejects. ++ 0x2 extra: s This event counts the number of L2 cache lines in the Shared state filling the L2. Counting does not cover rejects. ++ 0x4 extra: e This event counts the number of L2 cache lines in the Exclusive state filling the L2. Counting does not cover rejects. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 0cfb4ea..bce230a 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -130,6 +130,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "IBM Power Architected Events V1", "ppc64/architected_events_v1", CPU_PPC64_ARCH_V1, 6 }, + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, ++ { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -670,6 +671,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_ATOM: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 7c478ad..3754156 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -110,6 +110,7 @@ typedef enum { + CPU_PPC64_ARCH_V1, /** < IBM Power architected events version 1 */ + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ + CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ ++ CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 968ff04..9c27e6c 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1201,6 +1201,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index e86dcae..1d39692 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -148,8 +148,11 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x3f: + case 0x45: + case 0x46: +- case 0x47: + return CPU_HASWELL; ++ case 
0x3d: ++ case 0x47: ++ case 0x4f: ++ return CPU_BROADWELL; + case 0x37: + case 0x4d: + return CPU_SILVERMONT; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 35f47bc..bf3fbcb 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -555,6 +555,7 @@ int main(int argc, char const * argv[]) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +commit 5ce12ed9d20a91f19cba6e8ecadc478fcd57db6c +Author: Andi Kleen +Date: Thu Jul 17 12:45:09 2014 -0500 + + Fix some problems in the Broadwell events + + Fix some problems in the previous commit of the Broadwell events. + Most flags were missing due to a bug in the generation script. + This patch also re-adds proper PEBS events. + + Signed-off-by: Andi Kleen + +diff --git a/events/i386/broadwell/events b/events/i386/broadwell/events +index 6a4b388..ec55836 100644 +--- a/events/i386/broadwell/events ++++ b/events/i386/broadwell/events +@@ -58,7 +58,7 @@ event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_insert + event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : + event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : + event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : +-event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : ++event:0xd3 counters:0,1,2,3 um:mem_load_uops_l3_miss_retired minimum:100007 name:mem_load_uops_l3_miss_retired : + event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : + event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : + event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 470e9e9..0d6ccd5 100644 +--- a/events/i386/broadwell/unit_masks ++++ 
b/events/i386/broadwell/unit_masks +@@ -36,7 +36,7 @@ name:uops_issued type:exclusive default:0x1 + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. + 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated +- 0x1 extra:inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. ++ 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. + name:l2_rqsts type:exclusive default:0x21 + 0x21 extra: demand_data_rd_miss This event counts the number of demand Data Read requests that miss L2 cache. Only not rejected loads are counted. + 0x41 extra: demand_data_rd_hit This event counts the number of demand Data Read requests that hit L2 cache. Only not rejected loads are counted. +@@ -56,7 +56,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xff extra: references All L2 requests + name:l1d_pend_miss type:exclusive default:0x1 + 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. 
Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. +- 0x1 extra: pending_cycles This event counts duration of L1D miss outstanding in cycles. ++ 0x1 extra:cmask=1 pending_cycles This event counts duration of L1D miss outstanding in cycles. + name:dtlb_store_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). + 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. +@@ -80,7 +80,7 @@ name:move_elimination type:exclusive default:0x1 + name:cpl_cycles type:exclusive default:0x1 + 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. + 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. +- 0x1 extra:edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. ++ 0x1 extra:cmask=1,edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. + name:tx_exec type:exclusive default:0x1 + 0x1 extra: misc1 Unfriendly TSX abort triggered by a flowmarker + 0x2 extra: misc2 Unfriendly TSX abort triggered by a vzeroupper instruction +@@ -89,14 +89,14 @@ name:tx_exec type:exclusive default:0x1 + 0x10 extra: misc5 # HLE inside HLE+ + name:rs_events type:exclusive default:0x1 + 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. +- 0x1 extra:inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. 
Could be useful to precisely locate Frontend Latency Bound issues. ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. + name:offcore_requests_outstanding type:exclusive default:0x1 + 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. + 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. + 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. + 0x8 extra: all_data_rd This event counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. +- 0x1 extra: cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). 
A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). +- 0x8 extra: cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). ++ 0x8 extra:cmask=1 cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. + name:lock_cycles type:exclusive default:0x1 + 0x1 extra: split_lock_uc_lock_duration This event counts cycles in which the L1 and L2 are locked due to a UC lock or split lock. A lock is asserted in case of locked memory access, due to noncacheable memory, locked operation that spans two cache lines, or a page walk from the noncacheable page table. L1D and L2 locks have a very high performance penalty and it is highly recommended to avoid such access. + 0x2 extra: cache_lock_duration This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION). 
+@@ -107,17 +107,17 @@ name:idq type:exclusive default:0x2 + 0x10 extra: ms_dsb_uops This event counts the number of uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. + 0x20 extra: ms_mite_uops This event counts the number of uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. + 0x30 extra: ms_uops This event counts the total number of uops delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. +- 0x30 extra: ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. +- 0x4 extra: mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. +- 0x8 extra: dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x10 extra: ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. +- 0x10 extra:edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. 
+- 0x18 extra: all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x18 extra: all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x24 extra: all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). +- 0x24 extra: all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x30 extra:cmask=1 ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. ++ 0x4 extra:cmask=1 mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. ++ 0x8 extra:cmask=1 dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra:cmask=1 ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. 
++ 0x10 extra:cmask=1,edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra:cmask=4 all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra:cmask=1 all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x24 extra:cmask=4 all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x24 extra:cmask=1 all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). + 0x3c extra: mite_all_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). 
+- 0x30 extra:edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer + name:itlb_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). + 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. +@@ -149,11 +149,11 @@ name:br_misp_exec type:exclusive default:0xff + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls + name:idq_uops_not_delivered type:exclusive default:0x1 + 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding ?4 ? x? when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops. +- 0x1 extra: cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. +- 0x1 extra: cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. 
+- 0x1 extra: cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end +- 0x1 extra: cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end +- 0x1 extra:inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. ++ 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end ++ 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end ++ 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. + name:uops_executed_port type:exclusive default:0x1 + 0x1 extra:any port_0_core Cycles per core when uops are exectuted in port 0 + 0x2 extra:any port_1_core Cycles per core when uops are exectuted in port 1 +@@ -186,24 +186,24 @@ name:resource_stalls type:exclusive default:0x1 + 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. + name:cycle_activity type:exclusive default:0x1 +- 0x1 extra: cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. +- 0x8 extra: cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. 
+- 0x2 extra: cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) +- 0x4 extra: cycles_no_execute Counts number of cycles nothing is executed on any execution port. +- 0x5 extra: stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands +- 0x6 extra: stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. +- 0xc extra: stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. +- 0x8 extra: cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. +- 0x1 extra: cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. +- 0x2 extra: cycles_mem_any Cycles while memory subsystem has an outstanding load. +- 0x4 extra: stalls_total Total execution stalls. +- 0xc extra: stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. +- 0x5 extra: stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. +- 0x6 extra: stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++ 0x1 extra:cmask=1 cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. ++ 0x8 extra:cmask=8 cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. 
++ 0x2 extra:cmask=2 cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) ++ 0x4 extra:cmask=4 cycles_no_execute Counts number of cycles nothing is executed on any execution port. ++ 0x5 extra:cmask=5 stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands ++ 0x6 extra:cmask=6 stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. ++ 0xc extra:cmask=c stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. ++ 0x8 extra:cmask=8 cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. ++ 0x1 extra:cmask=1 cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x2 extra:cmask=2 cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x4 extra:cmask=4 stalls_total Total execution stalls. ++ 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x6 extra:cmask=6 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. + name:lsd type:exclusive default:0x1 + 0x1 extra: uops Number of Uops delivered by the LSD. 
Read more on LSD under LSD_REPLAY.REPLAY +- 0x1 extra: cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder +- 0x1 extra: cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder + name:offcore_requests type:exclusive default:0x1 + 0x1 extra: demand_data_rd This event counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. + 0x2 extra: demand_code_rd This event counts both cacheable and noncachaeble code read requests. +@@ -212,11 +212,11 @@ name:offcore_requests type:exclusive default:0x1 + name:uops_executed type:exclusive default:0x1 + 0x1 extra: thread Number of uops to be executed per-thread each cycle. + 0x2 extra: core Number of uops executed from any thread +- 0x1 extra:inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. +- 0x1 extra: cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread +- 0x1 extra: cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread +- 0x1 extra: cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread +- 0x1 extra: cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. 
++ 0x1 extra:cmask=1 cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread + name:page_walker_loads type:exclusive default:0x11 + 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB + 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB +@@ -227,38 +227,47 @@ name:page_walker_loads type:exclusive default:0x11 + 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory + name:inst_retired type:exclusive default:0x2 + 0x2 extra: x87 This is a non-precise version (that is, does not use PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling. +- 0x1 extra: prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. ++ 0x1 extra:pebs prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. + name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. + 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. + name:uops_retired type:exclusive default:0x1 + 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops. 
Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. ++ 0x1 extra: all_pebs Counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. +- 0x1 extra:inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. +- 0x1 extra:inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. ++ 0x2 extra: retire_slots_pebs Counts the number of retirement slots used. ++ 0x1 extra:cmask=1,inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. + name:machine_clears type:exclusive default:0x1 + 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. + 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. + 0x20 extra: maskmov Maskmov false fault - counts number of time ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. +- 0x1 extra:edge count Number of machine clears (nukes) of any type. ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. 
+ name:br_inst_retired type:exclusive default:0x1 + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Counts conditional branch instructions retired. + 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. ++ 0x2 extra: near_call_pebs Counts both direct and indirect near call instructions retired. + 0x8 extra: near_return This is a non-precise version (that is, does not use PEBS) of the event that counts return instructions retired. ++ 0x8 extra: near_return_pebs Counts return instructions retired. + 0x10 extra: not_taken This is a non-precise version (that is, does not use PEBS) of the event that counts not taken branch instructions retired. + 0x20 extra: near_taken This is a non-precise version (that is, does not use PEBS) of the event that counts taken branch instructions retired. ++ 0x20 extra: near_taken_pebs Counts taken branch instructions retired. + 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. +- 0x4 extra: all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. + name:br_misp_retired type:exclusive default:0x1 + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. +- 0x4 extra: all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. ++ 0x1 extra: conditional_pebs Counts mispredicted conditional branch instructions retired. 
++ 0x4 extra:pebs all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. + 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra: near_taken_pebs number of near branch instructions retired that were mispredicted and taken. + name:hle_retired type:exclusive default:0x1 + 0x1 extra: start Number of times we entered an HLE region; does not count nested transactions + 0x2 extra: commit Number of times HLE commit succeeded + 0x4 extra: aborted Number of times HLE abort was triggered ++ 0x4 extra: aborted_pebs Number of times HLE abort was triggered + 0x8 extra: aborted_misc1 Number of times an HLE abort was attributed to a Memory condition (See TSX_Memory event for additional details) + 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an HLE abort + 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an HLE abort +@@ -268,38 +277,60 @@ name:rtm_retired type:exclusive default:0x1 + 0x1 extra: start Number of times we entered an RTM region; does not count nested transactions + 0x2 extra: commit Number of times RTM commit succeeded + 0x4 extra: aborted Number of times RTM abort was triggered ++ 0x4 extra: aborted_pebs Number of times RTM abort was triggered + 0x8 extra: aborted_misc1 Number of times an RTM abort was attributed to a Memory condition (See TSX_Memory event for additional details) + 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an RTM abort + 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an RTM abort + 0x40 extra: aborted_misc4 Number of times a RTM caused a fault + 0x80 extra: aborted_misc5 Number of times RTM aborted and was not due to the abort conditions in subevents 3-6 + name:fp_assist type:exclusive default:0x1e +- 0x1e extra: any This event counts cycles with any input and output SSE or x87 FP assist. 
If an input and output assist are detected on the same cycle the event increments by 1. ++ 0x1e extra:cmask=1 any This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1. + 0x2 extra: x87_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid. + 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid. + 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. + 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. + name:mem_uops_retired type:exclusive default:0x11 + 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x11 extra: stlb_miss_loads_pebs Counts load uops with true STLB miss retired to the architected path. 
True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x12 extra: stlb_miss_stores_pebs Counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x21 extra: lock_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with locked access retired to the architected path. ++ 0x21 extra: lock_loads_pebs Counts load uops with locked access retired to the architected path. + 0x41 extra: split_loads This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x41 extra: split_loads_pebs Counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). + 0x42 extra: split_stores This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x42 extra: split_stores_pebs Counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). 
+ 0x81 extra: all_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. ++ 0x81 extra: all_loads_pebs Counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. + 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. ++ 0x82 extra: all_stores_pebs Counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. + name:mem_load_uops_retired type:exclusive default:0x1 + 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source ++ 0x1 extra: l1_hit_pebs Counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. 
This event also counts SW prefetches independent of the actual data source + 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. ++ 0x2 extra: l2_hit_pebs Counts retired load uops which data sources were hits in the mid-level (L2) cache. + 0x4 extra: l3_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. ++ 0x4 extra: l3_hit_pebs Counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. + 0x8 extra: l1_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. ++ 0x8 extra: l1_miss_pebs Counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. + 0x10 extra: l2_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. ++ 0x10 extra: l2_miss_pebs Counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. + 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. ++ 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. + 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. 
Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. ++ 0x40 extra: hit_lfb_pebs Counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. + name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. ++ 0x1 extra: xsnp_miss_pebs Counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. ++ 0x2 extra: xsnp_hit_pebs Counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. + 0x4 extra: xsnp_hitm This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). ++ 0x4 extra: xsnp_hitm_pebs Counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). + 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. 
++ 0x8 extra: xsnp_none_pebs Counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. ++name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++ 0x1 extra: local_dram Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) ++ 0x1 extra: local_dram_pebs Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + name:l2_trans type:exclusive default:0x80 + 0x80 extra: all_requests This event counts transactions that access the L2 pipe including snoops, pagewalks, and so on. + 0x1 extra: demand_data_rd This event counts Demand Data Read requests that access L2 cache, including rejects. +commit 893c18c2a2ba955bc77140bbd7696cc2d3f6e1dc +Author: Andi Kleen +Date: Thu Jul 17 12:55:42 2014 -0500 + + Improve error message for non-unique unit mask + + For the case where the user does not specify a UM and the default UM + is a non-unique hex value, the error message printed is the following: + + Default unit mask not supported for this event. + Please specify a unit mask by name, using the first word of the unit mask description. + + For cases where the user wrongly specifies a non-unique hex value for a UM + when they should have specified it by name, the message will be like the + following example: + + Unit mask (0x1) is non unique. + Please specify a unit mask by name, using the first word of the unit mask description. 
+ + Signed-off-by: Andi Kleen + +diff --git a/libop/op_events.c b/libop/op_events.c +index 9c27e6c..b8900a5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1389,6 +1389,7 @@ static void do_resolve_unit_mask(struct op_event *e, + if (pe->unit_mask_name == NULL) { + /* For numerical unit mask */ + int found = 0; ++ int old_um_valid = pe->unit_mask_valid; + + /* Use default unitmask if not specified */ + if (!pe->unit_mask_valid) { +@@ -1404,9 +1405,16 @@ static void do_resolve_unit_mask(struct op_event *e, + found++; + } + if (found > 1) { +- fprintf(stderr, "Unit mask (0x%x) is non unique.\n" +- "Please specify the unit mask using the first " +- "word of the description\n", ++ if (!old_um_valid) ++ fprintf(stderr, ++ "Default unit mask not supported for this event.\n" ++ "Please specify a unit mask by name, using the first " ++ "word of the unit mask description\n"); ++ else ++ fprintf(stderr, ++ "Unit mask (0x%x) is non unique.\n" ++ "Please specify the unit mask using the first " ++ "word of the description\n", + pe->unit_mask); + exit(EXIT_FAILURE); + } +commit 62e7814e8467230d8e283992ee6532d5f794359a +Author: Michael Petlan +Date: Thu Jun 11 11:24:51 2015 -0400 + + Fix default unit masks for Intel Broadwell + + Since some of the default unit masks for Intel Broadwell events cannot be + uniquely specified by numbers, the defaults have had to be replaced + by the named ones. When the affected events are used on Broadwell without + specifying unit masks after applying this patch, the default masks + are chosen correctly.
+ + Signed-off-by: William Cohen + +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 0d6ccd5..4e69363 100644 +--- a/events/i386/broadwell/unit_masks ++++ b/events/i386/broadwell/unit_masks +@@ -31,7 +31,7 @@ name:dtlb_load_misses type:exclusive default:0x1 + 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) + 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +-name:uops_issued type:exclusive default:0x1 ++name:uops_issued type:exclusive default:any + 0x1 extra: any This event counts the number of Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS). + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. +@@ -54,7 +54,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xe7 extra: all_demand_references Demand requests to L2 cache + 0x3f extra: miss All requests that miss L2 cache + 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:0x1 ++name:l1d_pend_miss type:exclusive default:pending + 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. 
The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. + 0x1 extra:cmask=1 pending_cycles This event counts duration of L1D miss outstanding in cycles. + name:dtlb_store_misses type:exclusive default:0x1 +@@ -77,7 +77,7 @@ name:move_elimination type:exclusive default:0x1 + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. +-name:cpl_cycles type:exclusive default:0x1 ++name:cpl_cycles type:exclusive default:ring0 + 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. + 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. + 0x1 extra:cmask=1,edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. +@@ -87,10 +87,10 @@ name:tx_exec type:exclusive default:0x1 + 0x4 extra: misc3 Unfriendly TSX abort triggered by a nest count that is too deep + 0x8 extra: misc4 RTM region detected inside HLE + 0x10 extra: misc5 # HLE inside HLE+ +-name:rs_events type:exclusive default:0x1 ++name:rs_events type:exclusive default:empty_cycles + 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. 
+ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. +-name:offcore_requests_outstanding type:exclusive default:0x1 ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd + 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. + 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. + 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. +@@ -147,14 +147,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired mispredicted macro conditional branch instructions. + 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken mispredicted indirect branches excluding calls and returns. 
+ 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:0x1 ++name:idq_uops_not_delivered type:exclusive default:core + 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding ?4 ? x? when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops. + 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. + 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. 
+-name:uops_executed_port type:exclusive default:0x1 ++name:uops_executed_port type:exclusive default:port_0 + 0x1 extra:any port_0_core Cycles per core when uops are exectuted in port 0 + 0x2 extra:any port_1_core Cycles per core when uops are exectuted in port 1 + 0x4 extra:any port_2_core Cycles per core when uops are dispatched to port 2 +@@ -200,7 +200,7 @@ name:cycle_activity type:exclusive default:0x1 + 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. + 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. + 0x6 extra:cmask=6 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. +-name:lsd type:exclusive default:0x1 ++name:lsd type:exclusive default:uops + 0x1 extra: uops Number of Uops delivered by the LSD. Read more on LSD under LSD_REPLAY.REPLAY + 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder + 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder +@@ -209,7 +209,7 @@ name:offcore_requests type:exclusive default:0x1 + 0x2 extra: demand_code_rd This event counts both cacheable and noncachaeble code read requests. + 0x4 extra: demand_rfo This event counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. + 0x8 extra: all_data_rd This event counts the demand and prefetch data reads. All Core Data Reads include cacheable "Demands" and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. +-name:uops_executed type:exclusive default:0x1 ++name:uops_executed type:exclusive default:thread + 0x1 extra: thread Number of uops to be executed per-thread each cycle. 
+ 0x2 extra: core Number of uops executed from any thread + 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. +@@ -232,20 +232,20 @@ name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. + 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:0x1 ++name:uops_retired type:exclusive default:all + 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x1 extra: all_pebs Counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. + 0x2 extra: retire_slots_pebs Counts the number of retirement slots used. + 0x1 extra:cmask=1,inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. + 0x1 extra:cmask=a,inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. 
+-name:machine_clears type:exclusive default:0x1 ++name:machine_clears type:exclusive default:cycles + 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. + 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. + 0x20 extra: maskmov Maskmov false fault - counts number of time ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. + 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. +-name:br_inst_retired type:exclusive default:0x1 ++name:br_inst_retired type:exclusive default:conditional + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. + 0x1 extra: conditional_pebs Counts conditional branch instructions retired. + 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. +@@ -257,7 +257,7 @@ name:br_inst_retired type:exclusive default:0x1 + 0x20 extra: near_taken_pebs Counts taken branch instructions retired. + 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. + 0x4 extra:pebs all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. 
+-name:br_misp_retired type:exclusive default:0x1 ++name:br_misp_retired type:exclusive default:conditional + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. + 0x1 extra: conditional_pebs Counts mispredicted conditional branch instructions retired. + 0x4 extra:pebs all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. +@@ -289,7 +289,7 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid. + 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. + 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. +-name:mem_uops_retired type:exclusive default:0x11 ++name:mem_uops_retired type:exclusive default:stlb_miss_loads + 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. 
+ 0x11 extra: stlb_miss_loads_pebs Counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. +@@ -304,7 +304,7 @@ name:mem_uops_retired type:exclusive default:0x11 + 0x81 extra: all_loads_pebs Counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. + 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. + 0x82 extra: all_stores_pebs Counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. +-name:mem_load_uops_retired type:exclusive default:0x1 ++name:mem_load_uops_retired type:exclusive default:l1_hit + 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. 
This event also counts SW prefetches independent of the actual data source + 0x1 extra: l1_hit_pebs Counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source + 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. +@@ -319,7 +319,7 @@ name:mem_load_uops_retired type:exclusive default:0x1 + 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. + 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. + 0x40 extra: hit_lfb_pebs Counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. 
+-name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss + 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x1 extra: xsnp_miss_pebs Counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. +@@ -328,7 +328,7 @@ name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x4 extra: xsnp_hitm_pebs Counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). + 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. + 0x8 extra: xsnp_none_pebs Counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. 
+-name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_miss_retired type:exclusive default:local_dram + 0x1 extra: local_dram Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + 0x1 extra: local_dram_pebs Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + name:l2_trans type:exclusive default:0x80 +commit 723a3042bd23deca01a36f6d99cdf10fe935c0d0 +Author: William Cohen +Date: Thu Jun 11 16:56:16 2015 -0400 + + Use a named default for the Intel Broadwell cycle_activity default unit_mask + + Since default unit mask for Intel Broadwell cycle_activity cannot be + uniquely specified by numbers, the default has to be replaced by a + named one. + + Signed-off-by: William Cohen + +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 4e69363..505ba21 100644 +--- a/events/i386/broadwell/unit_masks ++++ b/events/i386/broadwell/unit_masks +@@ -185,7 +185,7 @@ name:resource_stalls type:exclusive default:0x1 + 0x4 extra: rs This event counts stall cycles caused by absence of eligible entries in the reservation station (RS). This may result from RS overflow, or from RS deallocation because of the RS array Write Port allocation scheme (each RS entry has two write ports instead of four. As a result, empty entries could not be used, although RS is not really full). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. 
+-name:cycle_activity type:exclusive default:0x1 ++name:cycle_activity type:exclusive default:cycles_l2_pending + 0x1 extra:cmask=1 cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. + 0x8 extra:cmask=8 cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. + 0x2 extra:cmask=2 cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) diff --git a/SOURCES/oprofile-bz1264443.patch b/SOURCES/oprofile-bz1264443.patch new file mode 100644 index 0000000..5a340e6 --- /dev/null +++ b/SOURCES/oprofile-bz1264443.patch @@ -0,0 +1,93 @@ +diff -up oprofile-0.9.9/libpp/profile_spec.cpp.archive oprofile-0.9.9/libpp/profile_spec.cpp +--- oprofile-0.9.9/libpp/profile_spec.cpp.archive 2013-07-29 11:55:06.000000000 -0400 ++++ oprofile-0.9.9/libpp/profile_spec.cpp 2016-07-06 11:20:55.076624764 -0400 +@@ -102,6 +102,8 @@ void profile_spec::set_image_or_lib_name + void profile_spec::parse_archive_path(string const & str) + { + archive_path = op_realpath(str); ++ /* Need to force session directory default location in the archive */ ++ init_op_config_dirs(OP_SESSION_DIR_DEFAULT); + } + + +diff -up oprofile-0.9.9/pp/oparchive.cpp.archive oprofile-0.9.9/pp/oparchive.cpp +--- oprofile-0.9.9/pp/oparchive.cpp.archive 2013-07-29 11:55:06.000000000 -0400 ++++ oprofile-0.9.9/pp/oparchive.cpp 2016-07-06 11:20:55.076624764 -0400 +@@ -232,6 +232,19 @@ int oparchive(options::spec const & spec + } + } + ++ /* place samples and other related material in easily found default directory */ ++ string dest_session_dir = options::outdirectory + string(OP_SESSION_DIR_DEFAULT); ++ string dest_samples_dir = dest_session_dir + string("samples"); ++ ++ /* dest_session_dir is parent of dest_samples and will also be created */ ++ ++ if (!options::list_files && ++ 
create_path(dest_samples_dir.c_str())) { ++ cerr << "Unable to create directory for " ++ << dest_samples_dir << "." << endl; ++ exit (EXIT_FAILURE); ++ } ++ + /* copy over each of the sample files */ + list::iterator sit = sample_files.begin(); + list::iterator const send = sample_files.end(); +@@ -245,9 +258,13 @@ int oparchive(options::spec const & spec + + for (; sit != send; ++sit) { + string sample_name = *sit; ++ /* determine the session name of sample file */ ++ int offset = sample_name.find('{'); ++ string base_samples_dir = sample_name.substr(0, offset-1); ++ string session = basename(base_samples_dir.c_str()); + /* Get rid of the the archive_path from the name */ +- string sample_base = sample_name.substr(archive_path.size()); +- string sample_archive_file = options::outdirectory + sample_base; ++ string sample_base = sample_name.substr(offset); ++ string sample_archive_file = dest_samples_dir + "/" + session + "/" + sample_base; + + cverb << vdebug << sample_name << endl; + cverb << vdebug << " destp " << sample_archive_file << endl; +@@ -268,19 +285,19 @@ int oparchive(options::spec const & spec + cerr << "Unable to to obtain realpath for " << op_session_dir << endl; + exit (EXIT_FAILURE); + } +- string abi_name = string(real_session_dir) + "/abi"; +- copy_one_file(image_ok, archive_path + abi_name, +- options::outdirectory + abi_name); ++ string abi_name = string(real_session_dir) + string("/abi"); ++ string dest_abi_name = dest_session_dir + string("/abi"); ++ copy_one_file(image_ok, archive_path + abi_name, dest_abi_name); + + /* copy over the /samples/oprofiled.log file */ +- string log_name = string(real_session_dir) + string("/samples") + "/oprofiled.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ string log_name = string(real_session_dir) + string("/samples") + string("/oprofiled.log"); ++ string dest_log_name = dest_samples_dir + string("/oprofiled.log"); ++ copy_one_file(image_ok, archive_path 
+ log_name, dest_log_name); + + /* copy over the /samples/operf.log file */ +- log_name = string(real_session_dir) + string("/samples") + "/operf.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ log_name = string(real_session_dir) + string("/samples") + string("/operf.log"); ++ dest_log_name = dest_samples_dir + string("/operf.log"); ++ copy_one_file(image_ok, archive_path + log_name, dest_log_name); + + free(real_session_dir); + +diff -up oprofile-0.9.9/pp/oparchive_options.cpp.archive oprofile-0.9.9/pp/oparchive_options.cpp +--- oprofile-0.9.9/pp/oparchive_options.cpp.archive 2016-07-06 11:20:55.077624764 -0400 ++++ oprofile-0.9.9/pp/oparchive_options.cpp 2016-07-06 11:26:13.968624764 -0400 +@@ -124,7 +124,6 @@ void handle_options(options::spec const + + if (strncmp(op_session_dir, "/var/lib/oprofile", strlen("/var/lib/oprofile"))) + cerr << "NOTE: The sample data in this archive is located at " << op_session_dir << endl +- << "instead of the standard location of /var/lib/oprofile. Hence, when using opreport" << endl +- << "and other post-processing tools on this archive, you must pass the following option:" << endl +- << "\t--session-dir=" << op_session_dir << endl; ++ << "and is being moved to the standard location of " << OP_SESSION_DIR_DEFAULT << "." ++ << endl; + } diff --git a/SOURCES/oprofile-bz1335145.patch b/SOURCES/oprofile-bz1335145.patch new file mode 100644 index 0000000..4eaf6a6 --- /dev/null +++ b/SOURCES/oprofile-bz1335145.patch @@ -0,0 +1,185 @@ +commit a99127699330dce984dba38156230ab3584d0d6e +Author: William Cohen +Date: Mon Nov 30 17:13:32 2015 -0500 + + Make Intel Westmere and Nehalem event names unique + + The Intel Westmere and Nehalem event lists each had two events named + MACRO_INSTS. The event names in the event lists need to be unique. 
+ The event referring to the Macro-fused instructions decoded (0xa6) + has been renamed MACRO_INSTS_FUSED to avoid the name collision with + MACRO_INSTS. + + Signed-off-by: William Cohen + +diff --git a/events/i386/nehalem/events b/events/i386/nehalem/events +index 31a08b6..6951f35 100644 +--- a/events/i386/nehalem/events ++++ b/events/i386/nehalem/events +@@ -68,7 +68,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:6000 name:ILD_STALL : Cycles In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:6000 name:BR_INST_EXEC : Counts the number of near branch instructions executed, but not necessarily retired. + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:6000 name:BR_MISP_EXEC : Counts the number of mispredicted conditional near branch instructions executed, but not necessarily retired. + event:0xA2 counters:0,1,2,3 um:resource_stalls minimum:6000 name:RESOURCE_STALLS : Counts the number of Allocator resource related stalls. Includes register renaming buffer entries, memory buffer entries. In addition to resource related stalls, this event counts some other events. Includes stalls arising during branch misprediction recovery, such as if retirement of the mispredicted branch is delayed and stalls arising while store buffer is draining from synchronizing operations. +-event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. ++event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS_FUSED : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. + event:0xA7 counters:0,1,2,3 um:one minimum:6000 name:BACLEAR_FORCE_IQ : Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ is also responsible for providing conditional branch prediciton direction based on a static scheme and dynamic data provided by the L2 Branch Prediction Unit. 
If the conditional branch target is not found in the Target Array and the IQ predicts that the branch is taken, then the IQ will force the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble in the instruction fetch pipeline. + event:0xA8 counters:0,1,2,3 um:one minimum:6000 name:LSD : Counts the number of micro-ops delivered by loop stream detector + event:0xAE counters:0,1,2,3 um:one minimum:6000 name:ITLB_FLUSH : Counts the number of ITLB flushes +diff --git a/events/i386/westmere/events b/events/i386/westmere/events +index d919867..d7b2064 100644 +--- a/events/i386/westmere/events ++++ b/events/i386/westmere/events +@@ -48,7 +48,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:2000000 name:ILD_STALL : Any In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:200000 name:BR_INST_EXEC : Branch instructions executed + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:20000 name:BR_MISP_EXEC : Mispredicted branches executed + event:0xa2 counters:0,1,2,3 um:resource_stalls minimum:2000000 name:RESOURCE_STALLS : Resource related stall cycles +-event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS : Macro-fused instructions decoded ++event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS_FUSED : Macro-fused instructions decoded + event:0xa7 counters:0,1,2,3 um:x01 minimum:2000000 name:BACLEAR_FORCE_IQ : Instruction queue forced BACLEAR + event:0xa8 counters:0,1,2,3 um:x01 minimum:2000000 name:LSD : Cycles when uops were delivered by the LSD + event:0xae counters:0,1,2,3 um:x01 minimum:2000000 name:ITLB_FLUSH : ITLB flushes +commit dc9076e99c9afada60cbe81dd43772cb72ec509d +Author: Michael Petlan +Date: Thu Apr 30 10:34:48 2015 -0400 + + Fix default unit masks for Haswells + + Since some of the default unit masks for Haswell events cannot be + uniquely specified by numbers, the defaults have had to be replaced + by the named ones. 
When the affected events are used on Haswell without + specifying unit masks after applying this patch, the default masks + are chosen correctly. + + Signed-off-by: Michael Petlan + +diff --git a/events/i386/haswell/unit_masks b/events/i386/haswell/unit_masks +index 60c2a61..9b4be33 100644 +--- a/events/i386/haswell/unit_masks ++++ b/events/i386/haswell/unit_masks +@@ -32,7 +32,7 @@ name:dtlb_load_misses type:exclusive default:0x1 + 0x80 extra: pde_cache_miss DTLB demand load misses with low part of linear-to-physical address translation missed + 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +-name:uops_issued type:exclusive default:0x1 ++name:uops_issued type:exclusive default:any + 0x1 extra: any This event counts the number of uops issued by the Front-end of the pipeline to the Back-end. This event is counted at the allocation stage and will count both retired and non-retired uops. + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. +@@ -56,7 +56,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xe7 extra: all_demand_references Demand requests to L2 cache + 0x3f extra: miss All requests that miss L2 cache + 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:0x1 ++name:l1d_pend_miss type:exclusive default:pending + 0x1 extra: pending L1D miss oustandings duration in cycles + 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. 
+ name:dtlb_store_misses type:exclusive default:0x1 +@@ -85,7 +85,7 @@ name:move_elimination type:exclusive default:0x1 + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. +-name:cpl_cycles type:exclusive default:0x1 ++name:cpl_cycles type:exclusive default:ring0 + 0x1 extra: ring0 Unhalted core cycles when the thread is in ring 0 + 0x2 extra: ring123 Unhalted core cycles when thread is in rings 1, 2, or 3 + 0x1 extra:cmask=1,edge ring0_trans Number of intervals between processor halts while thread is in ring 0 +@@ -95,10 +95,10 @@ name:tx_exec type:exclusive default:0x1 + 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded + 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. + 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region +-name:rs_events type:exclusive default:0x1 ++name:rs_events type:exclusive default:empty_cycles + 0x1 extra: empty_cycles This event counts cycles when the Reservation Station ( RS ) is empty for the thread. The RS is a structure that buffers allocated micro-ops from the Front-end. If there are many cycles when the RS is empty, it may represent an underflow of instructions delivered from the Front-end. + 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. 
+-name:offcore_requests_outstanding type:exclusive default:0x1 ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd + 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. + 0x2 extra: demand_code_rd Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle + 0x4 extra: demand_rfo Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore +@@ -164,14 +164,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional Speculative and retired mispredicted macro conditional branches + 0xc4 extra: all_indirect_jump_non_call_ret Mispredicted indirect branches excluding calls and returns + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:0x1 ++name:idq_uops_not_delivered type:exclusive default:core + 0x1 extra: core This event count the number of undelivered (unallocated) uops from the Front-end to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. The Front-end can allocate up to 4 uops per cycle so this event can increment 0-4 times per cycle depending on the number of unallocated uops. This event is counted on a per-core basis. + 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts the number cycles during which the Front-end allocated exactly zero uops to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. This event is counted on a per-core basis. + 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. 
+ 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +-name:uops_executed_port type:exclusive default:0x1 ++name:uops_executed_port type:exclusive default:port_0 + 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 + 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 + 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 +@@ -236,7 +236,7 @@ name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. Errata: HSM57 + 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. Errata: HSM57 + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:0x1 ++name:uops_retired type:exclusive default:all + 0x1 extra: all Actually retired uops. + 0x1 extra: all_pebs Actually retired uops. + 0x2 extra: retire_slots This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. +@@ -244,13 +244,13 @@ name:uops_retired type:exclusive default:0x1 + 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. + 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. + 0x1 extra:cmask=1,inv core_stall_cycles Cycles without actually retired uops. +-name:machine_clears type:exclusive default:0x1 ++name:machine_clears type:exclusive default:cycles + 0x1 extra: cycles Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering machine clears detected. Memory ordering machine clears can result from memory address aliasing or snoops from another hardware thread or core to data inflight in the pipeline. 
Machine clears can have a significant performance impact if they are happening frequently. + 0x4 extra: smc This event is incremented when self-modifying code (SMC) is detected, which causes a machine clear. Machine clears can have a significant performance impact if they are happening frequently. + 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. + 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. +-name:br_inst_retired type:exclusive default:0x1 ++name:br_inst_retired type:exclusive default:conditional + 0x1 extra: conditional Conditional branch instructions retired. + 0x1 extra: conditional_pebs Conditional branch instructions retired. + 0x2 extra: near_call Direct and indirect near call instructions retired. +@@ -262,7 +262,7 @@ name:br_inst_retired type:exclusive default:0x1 + 0x20 extra: near_taken_pebs Taken branch instructions retired. + 0x40 extra: far_branch Far branch instructions retired. + 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. +-name:br_misp_retired type:exclusive default:0x1 ++name:br_misp_retired type:exclusive default:conditional + 0x1 extra: conditional Mispredicted conditional branch instructions retired. + 0x1 extra: conditional_pebs Mispredicted conditional branch instructions retired. + 0x4 extra:pebs all_branches_pebs This event counts all mispredicted branch instructions retired. This is a precise event. +@@ -294,7 +294,7 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input Number of X87 assists due to input value. + 0x8 extra: simd_output Number of SIMD FP assists due to Output values + 0x10 extra: simd_input Number of SIMD FP assists due to input values +-name:mem_uops_retired type:exclusive default:0x11 ++name:mem_uops_retired type:exclusive default:stlb_miss_loads + 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. 
Errata: HSM30 + 0x11 extra: stlb_miss_loads_pebs Load uops with true STLB miss retired to architected path. Errata: HSM30 + 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. Errata: HSM30 +@@ -309,7 +309,7 @@ name:mem_uops_retired type:exclusive default:0x11 + 0x81 extra: all_loads_pebs Load uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 + 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 + 0x82 extra: all_stores_pebs Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 +-name:mem_load_uops_retired type:exclusive default:0x1 ++name:mem_load_uops_retired type:exclusive default:l1_hit + 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. Errata: HSM30 + 0x1 extra: l1_hit_pebs Retired load uops with L1 cache hits as data sources. Errata: HSM30 + 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. Errata: HSM30 +@@ -324,7 +324,7 @@ name:mem_load_uops_retired type:exclusive default:0x1 + 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 + 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 + 0x40 extra: hit_lfb_pebs Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 +-name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss + 0x1 extra: xsnp_miss Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 + 0x1 extra: xsnp_miss_pebs Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. 
Errata: HSM26, HSM30 + 0x2 extra: xsnp_hit Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 +@@ -333,7 +333,7 @@ name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x4 extra: xsnp_hitm_pebs Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30 + 0x8 extra: xsnp_none Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 + 0x8 extra: xsnp_none_pebs Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 +-name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_miss_retired type:exclusive default:local_dram + 0x1 extra: local_dram This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30 + 0x1 extra: local_dram_pebs This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. 
Errata: HSM30 + name:l2_trans type:exclusive default:0x80 diff --git a/SOURCES/oprofile-captest.patch b/SOURCES/oprofile-captest.patch new file mode 100644 index 0000000..6f16604 --- /dev/null +++ b/SOURCES/oprofile-captest.patch @@ -0,0 +1,13 @@ +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index c5b6ee7..0550fa7 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -174,6 +174,8 @@ int op_pe_utils::op_check_perf_events_cap(bool use_cpu_minus_one) + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.sample_type = PERF_SAMPLE_IP; ++ /* avoid kernel events so test works when perf_event_paranoid = 2 */ ++ attr.exclude_kernel =1; + + pid = getpid(); + syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0); diff --git a/SOURCES/oprofile-coverity.patch b/SOURCES/oprofile-coverity.patch new file mode 100644 index 0000000..bdba88d --- /dev/null +++ b/SOURCES/oprofile-coverity.patch @@ -0,0 +1,320 @@ +commit be6d22999668ac976acd2008ec13db4385a0c8dd +Author: Maynard Johnson +Date: Mon Jan 27 15:44:18 2014 -0600 + + Fix issues detected by Coverity + + Will Cohen ran Coverity against oprofile and reported some issues + on Nov 20, 2013. I submitted the current oprofile source to the + Coverity webpage, and a couple new issues were detected. This + patch addresses most of these issues. Some issues are either + false positives from Coverity's analysis or have been marked + as "Intentional" so as to have Coverity ignore them. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/daemon/init.c b/daemon/init.c +index 2882c49..1fed812 100644 +--- a/daemon/init.c ++++ b/daemon/init.c +@@ -154,7 +154,7 @@ static void opd_do_jitdumps(void) + struct timeval tv; + char end_time_str[32]; + char opjitconv_path[PATH_MAX + 1]; +- char * exec_args[7]; ++ char * exec_args[8]; + + if (jit_conversion_running) + return; +@@ -175,6 +175,7 @@ static void opd_do_jitdumps(void) + if (vmisc) + exec_args[arg_num++] = "-d"; + exec_args[arg_num++] = "--delete-jitdumps"; ++ exec_args[arg_num++] = "--session-dir"; + exec_args[arg_num++] = session_dir; + exec_args[arg_num++] = start_time_str; + exec_args[arg_num++] = end_time_str; +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index aa0c1c5..0b7482f 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -487,7 +487,7 @@ handle_named_um: + (endptr <= (mask + strlen(mask) - 2))) { // '- 2' to account for linefeed and '\0' + + // Must be a default named unit mask +- strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); ++ strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN - 1); + goto handle_named_um; + } + config |= ((event->evt_um & 0xFFULL) << 8); +diff --git a/libutil++/op_bfd.h b/libutil++/op_bfd.h +index 6ce71fa..1aa7e10 100644 +--- a/libutil++/op_bfd.h ++++ b/libutil++/op_bfd.h +@@ -334,8 +334,8 @@ private: + bfd_vma vma_adj; + + /** +- * The file descriptor for an image file that we pass to fdopen_bfd must be kep +- * open through the life of the op_bfd to enable proper beahvior of certain ++ * The file descriptor for an image file that we pass to fdopen_bfd must be kept ++ * open through the life of the op_bfd to enable proper behavior of certain + * BFD functions -- in particular, bfd_find_nearest_line(). 
+ */ + int fd; +diff --git a/libutil++/op_spu_bfd.cpp b/libutil++/op_spu_bfd.cpp +index 4ac5245..29d6e06 100644 +--- a/libutil++/op_spu_bfd.cpp ++++ b/libutil++/op_spu_bfd.cpp +@@ -50,7 +50,7 @@ op_bfd::op_bfd(uint64_t spu_offset, string const & fname, + anon_obj(false), + vma_adj(0) + { +- int fd = -1; ++ fd = -1; + struct stat st; + int notes_remaining; + bool spu_note_found = false; +diff --git a/opjitconv/conversion.c b/opjitconv/conversion.c +index 111fe9d..add0f95 100644 +--- a/opjitconv/conversion.c ++++ b/opjitconv/conversion.c +@@ -39,10 +39,10 @@ static void free_jit_debug_line(void) + jitentry_debug_line_list = NULL; + } + +-int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, ++int op_jit_convert(struct op_jitdump_info * file_info, char const * elffile, + unsigned long long start_time, unsigned long long end_time) + { +- void const * jitdump = file_info.dmp_file; ++ void const * jitdump = file_info->dmp_file; + int rc= OP_JIT_CONV_OK; + + entry_count = 0; +@@ -53,7 +53,7 @@ int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, + jitentry_debug_line_list = NULL; + entries_symbols_ascending = entries_address_ascending = NULL; + +- if ((rc = parse_all(jitdump, jitdump + file_info.dmp_file_stat.st_size, ++ if ((rc = parse_all(jitdump, jitdump + file_info->dmp_file_stat.st_size, + end_time)) == OP_JIT_CONV_FAIL) + goto out; + +diff --git a/opjitconv/opjitconv.c b/opjitconv/opjitconv.c +index a9dfa91..9d910be 100644 +--- a/opjitconv/opjitconv.c ++++ b/opjitconv/opjitconv.c +@@ -19,6 +19,7 @@ + #include "op_file.h" + #include "op_libiberty.h" + ++#include + #include + #include + #include +@@ -75,6 +76,19 @@ int debug; + int non_root; + /* indicates we should delete jitdump files owned by the user */ + int delete_jitdumps; ++/* Session directory where sample data is stored */ ++char * session_dir; ++ ++static struct option long_options [] = { ++ { "session-dir", required_argument, NULL, 's'}, ++ { "debug", no_argument, 
NULL, 'd'}, ++ { "delete-jitdumps", no_argument, NULL, 'j'}, ++ { "non-root", no_argument, NULL, 'n'}, ++ { "help", no_argument, NULL, 'h'}, ++ { NULL, 9, NULL, 0} ++}; ++const char * short_options = "s:djnh"; ++ + LIST_HEAD(jitdump_deletion_candidates); + + /* +@@ -407,7 +421,7 @@ chk_proc_id: + goto free_res3; + } + /* Convert the dump file as the special user 'oprofile'. */ +- rc = op_jit_convert(dmp_info, tmp_elffile, start_time, end_time); ++ rc = op_jit_convert(&dmp_info, tmp_elffile, start_time, end_time); + if (rc < 0) + goto free_res3; + +@@ -772,61 +786,99 @@ static void _cleanup_jitdumps(void) + + } + +-int main(int argc, char ** argv) ++static void __print_usage(const char * extra_msg) ++{ ++ if (extra_msg) ++ fprintf(stderr, extra_msg); ++ fprintf(stderr, "usage: opjitconv [--debug | --non-root | --delete-jitdumps ] --session-dir= \n"); ++} ++ ++static int _process_args(int argc, char * const argv[]) ++{ ++ int keep_trying = 1; ++ int idx_of_non_options = 0; ++ setenv("POSIXLY_CORRECT", "1", 0); ++ while (keep_trying) { ++ int option_idx = 0; ++ int c = getopt_long(argc, argv, short_options, long_options, &option_idx); ++ switch (c) { ++ case -1: ++ if (optind != argc) { ++ idx_of_non_options = optind; ++ } ++ keep_trying = 0; ++ break; ++ case '?': ++ printf("non-option detected at optind %d\n", optind); ++ keep_trying = 0; ++ idx_of_non_options = -1; ++ break; ++ case 's': ++ session_dir = optarg; ++ break; ++ case 'd': ++ debug = 1; ++ break; ++ case 'n': ++ non_root = 1; ++ break; ++ case 'j': ++ delete_jitdumps = 1; ++ break; ++ case 'h': ++ break; ++ default: ++ break; ++ } ++ } ++ return idx_of_non_options; ++} ++ ++int main(int argc, char * const argv[]) + { + unsigned long long start_time, end_time; +- char session_dir[PATH_MAX]; +- int rc = 0; ++ struct stat filestat; ++ int non_options_idx, rc = 0; + size_t sessdir_len = 0; +- char * path_end; + + debug = 0; +- if (argc > 1 && strcmp(argv[1], "-d") == 0) { +- debug = 1; +- argc--; +- argv++; 
+- } + non_root = 0; +- if (argc > 1 && strcmp(argv[1], "--non-root") == 0) { +- non_root = 1; +- argc--; +- argv++; +- } +- + delete_jitdumps = 0; +- if (argc > 1 && strcmp(argv[1], "--delete-jitdumps") == 0) { +- delete_jitdumps = 1; +- argc--; +- argv++; +- } +- +- if (argc != 4) { +- printf("Usage: opjitconv [-d] " +- " \n"); ++ session_dir = NULL; ++ non_options_idx = _process_args(argc, argv); ++ // We need the session_dir and two non-option values passed -- starttime and endtime. ++ if (!session_dir || (non_options_idx != argc - 2)) { ++ __print_usage(NULL); + fflush(stdout); + rc = EXIT_FAILURE; + goto out; + } + + /* +- * Check for a maximum of 4096 bytes (Linux path name length limit) decremented +- * by 16 bytes (will be used later for appending samples sub directory). ++ * Check for a maximum of 4096 bytes (Linux path name length limit) minus 16 bytes ++ * (to be used later for appending samples sub directory) minus 1 (for terminator). + * Integer overflows according to the session dir parameter (user controlled) + * are not possible anymore. 
+ */ +- path_end = memchr(argv[1], '\0', PATH_MAX); +- if (!path_end || ((sessdir_len = (path_end - argv[1])) >= PATH_MAX - 16)) { ++ if ((sessdir_len = strlen(session_dir)) >= (PATH_MAX - 17)) { + printf("opjitconv: Path name length limit exceeded for session directory\n"); + rc = EXIT_FAILURE; + goto out; + } +- memset(session_dir, '\0', PATH_MAX); +- assert(sessdir_len < (PATH_MAX - 16 - 1)); +- strncpy(session_dir, argv[1], sessdir_len); +- session_dir[PATH_MAX -1] = '\0'; + +- start_time = atol(argv[2]); +- end_time = atol(argv[3]); ++ if (stat(session_dir, &filestat)) { ++ perror("stat operation on passed session-dir failed"); ++ rc = EXIT_FAILURE; ++ goto out; ++ } ++ if (!S_ISDIR(filestat.st_mode)) { ++ printf("Passed session-dir %s is not a directory\n", session_dir); ++ rc = EXIT_FAILURE; ++ goto out; ++ } ++ ++ start_time = atol(argv[non_options_idx++]); ++ end_time = atol(argv[non_options_idx]); + + if (start_time > end_time) { + rc = EXIT_FAILURE; +diff --git a/opjitconv/opjitconv.h b/opjitconv/opjitconv.h +index f6243c9..a3ce37f 100644 +--- a/opjitconv/opjitconv.h ++++ b/opjitconv/opjitconv.h +@@ -99,7 +99,7 @@ int parse_all(void const * start, void const * end, + unsigned long long end_time); + + /* conversion.c */ +-int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, ++int op_jit_convert(struct op_jitdump_info *file_info, char const * elffile, + unsigned long long start_time, unsigned long long end_time); + + /* create_bfd.c */ +diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp +index 88aed3d..399308f 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -787,7 +787,7 @@ static void _do_jitdump_convert() + struct timeval tv; + char end_time_str[32]; + char opjitconv_path[PATH_MAX + 1]; +- char * exec_args[8]; ++ char * exec_args[9]; + + jitconv_pid = fork(); + switch (jitconv_pid) { +@@ -799,6 +799,7 @@ static void _do_jitdump_convert() + const char * debug_option = "-d"; + const char * 
non_root_user = "--non-root"; + const char * delete_jitdumps = "--delete-jitdumps"; ++ const char * sess_dir = "--session-dir"; + gettimeofday(&tv, NULL); + end_time = tv.tv_sec; + sprintf(end_time_str, "%llu", end_time); +@@ -810,6 +811,7 @@ static void _do_jitdump_convert() + if (my_uid != 0) + exec_args[arg_num++] = (char *)non_root_user; + exec_args[arg_num++] = (char *)delete_jitdumps; ++ exec_args[arg_num++] = (char *)sess_dir; + exec_args[arg_num++] = (char *)operf_options::session_dir.c_str(); + exec_args[arg_num++] = start_time_str; + exec_args[arg_num++] = end_time_str; diff --git a/SOURCES/oprofile-defaultmask.patch b/SOURCES/oprofile-defaultmask.patch new file mode 100644 index 0000000..ecd71ac --- /dev/null +++ b/SOURCES/oprofile-defaultmask.patch @@ -0,0 +1,33 @@ +commit fb9529161039e96d44b4b7396450cff04e3d9aa8 +Author: Maynard Johnson +Date: Tue Oct 15 14:58:16 2013 -0500 + + Fix operf/ocount default unit mask selection + + Many events (particularly in the x86* architectures) + require a unit mask value to specify the exact event + type. For such events, a default unit mask value + is assigned. When a user runs operf, ocount, or + opcontrol and specifies such an event but does not + specify a unit mask, the default unit mask should be + selected and used by the tool. A bug was discovered + with operf and ocount where the unit mask value in + this situation was being set to '0' instead of the + default unit mask value. This patch fixes the bug. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index b85d175..177835e 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -484,7 +484,8 @@ handle_named_um: + pclose(fp); + event->evt_um = strtoull(mask, &endptr, 10); + if ((endptr >= mask) && +- (endptr <= (mask + strlen(mask) - 1))) { ++ (endptr <= (mask + strlen(mask) - 2))) { // '- 2' to account for linefeed and '\0' ++ + // Must be a default named unit mask + strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); + goto handle_named_um; diff --git a/SOURCES/oprofile-env.patch b/SOURCES/oprofile-env.patch new file mode 100644 index 0000000..1b81a13 --- /dev/null +++ b/SOURCES/oprofile-env.patch @@ -0,0 +1,71 @@ +From b869a61861e161c855379c4b5700fd352da01154 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Thu, 8 Jan 2015 16:37:57 -0500 +Subject: [PATCH] Avoid permanently setting POSIXLY_CORRECT environment + variable + +During testing on Fedora it was discovered that operf was setting the +environment variable POSIXLY_CORRECT and this could potentially be +observed in the children tasks that operf starts (Red Hat Bugzilla +1178577). The operf, ocount, and opjitconv commands all ensure that +POSIXLY_CORRECT environment variable is set when the options are +processed with getopt_long, but they never unset the variable +afterwards. This patch ensures that POSIXLY_CORRECT is as it was +before it was set. 
+ +Signed-off-by: William Cohen +--- + opjitconv/opjitconv.c | 5 +++++ + pe_counting/ocount.cpp | 5 +++++ + pe_profiling/operf.cpp | 5 +++++ + 3 files changed, 15 insertions(+) + +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 07dfd0c..f7caede 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -579,6 +579,7 @@ static int _process_ocount_and_app_args(int argc, char * const argv[]) + { + bool keep_trying = true; + int idx_of_non_options = 0; ++ char * prev_env = getenv("POSIXLY_CORRECT"); + setenv("POSIXLY_CORRECT", "1", 0); + while (keep_trying) { + int option_idx = 0; +@@ -663,6 +664,10 @@ static int _process_ocount_and_app_args(int argc, char * const argv[]) + __print_usage_and_exit("ocount: unexpected end of arg parsing"); + } + } ++ ++ if (prev_env == NULL) ++ unsetenv("POSIXLY_CORRECT"); ++ + return idx_of_non_options; + } + +diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp +index 04a25d9..a186278 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -1258,6 +1258,7 @@ static int _process_operf_and_app_args(int argc, char * const argv[]) + { + bool keep_trying = true; + int idx_of_non_options = 0; ++ char * prev_env = getenv("POSIXLY_CORRECT"); + setenv("POSIXLY_CORRECT", "1", 0); + while (keep_trying) { + int option_idx = 0; +@@ -1331,6 +1332,10 @@ static int _process_operf_and_app_args(int argc, char * const argv[]) + __print_usage_and_exit("unexpected end of arg parsing"); + } + } ++ ++ if (prev_env == NULL) ++ unsetenv("POSIXLY_CORRECT"); ++ + return idx_of_non_options; + } + +-- +2.1.0 + diff --git a/SOURCES/oprofile-extramask.patch b/SOURCES/oprofile-extramask.patch new file mode 100644 index 0000000..6863288 --- /dev/null +++ b/SOURCES/oprofile-extramask.patch @@ -0,0 +1,97 @@ +From dd433306f249db81f1ef5cfffefeb2d0ad4e3115 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Tue, 10 Mar 2015 10:52:39 -0400 +Subject: [PATCH] Ensure that umask is set if the extra bits (edge, inv, cmask) 
+ are used + +When testing ocount on some of the Intel processors it was discovered +that the umask was not being set for events that specified the +extra bits. Below is an example of the problem on an Intel Ivy +Bridge processor with the event code missing the 0x03 unit masks for +the events: + +$ ocount --verbose -e int_misc:recovery_cycles -e int_misc:recovery_stalls_count ls +Final event code is 140000d +Final event code is 144000d +Number of events passed is 2 +Exec args are: ls +telling child to start app +parent says start app /usr/bin/ls +calling perf_event_open for pid 240d +perf_event_open returning fd 9 +perf_event_open returning fd a +perf counter setup complete +app 240d is running +going into waitpid on monitored app 240d +app process ended normally. +Reading counter data for event int_misc +Reading counter data for event int_misc + +Events were actively counted for 1070382 nanoseconds. +Event counts (actual) for /usr/bin/ls: + Event Count % time counted + int_misc:recovery_cycles 0 100.00 + int_misc:recovery_stalls_count 0 100.00 + +With this patch the umasks are included and the example executes correctly: + +$ ocount --verbose -e int_misc:recovery_cycles -e int_misc:recovery_stalls_count ls +Final event code is 140030d +Final event code is 144030d +Number of events passed is 2 +Exec args are: ls +telling child to start app +calling perf_event_open for pid 72e1 +parent says start app /usr/bin/ls +perf_event_open returning fd 9 +perf_event_open returning fd a +perf counter setup complete +app 72e1 is running +going into waitpid on monitored app 72e1 +app process ended normally. +Reading counter data for event int_misc +Reading counter data for event int_misc + +Events were actively counted for 1216948 nanoseconds. 
+Event counts (actual) for /usr/bin/ls: + Event Count % time counted + int_misc:recovery_cycles 69,730 100.00 + int_misc:recovery_stalls_count 14,800 100.00 + +Signed-off-by: William Cohen +--- + libop/op_events.c | 3 +++ + libop/op_events.h | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/libop/op_events.c b/libop/op_events.c +index 99266c6..2badc8e 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -238,6 +238,9 @@ static void parse_um_entry(struct op_described_um * entry, char const * line) + if (strisprefix(c, "extra:")) { + c += 6; + entry->extra = parse_extra(c); ++ /* include the regular umask if there are real extra bits */ ++ if (entry->extra != EXTRA_NONE) ++ entry->extra |= (entry->value & UMASK_MASK) << UMASK_SHIFT; + /* named mask */ + c = skip_nonws(c); + c = skip_ws(c); +diff --git a/libop/op_events.h b/libop/op_events.h +index ec345e5..f09c830 100644 +--- a/libop/op_events.h ++++ b/libop/op_events.h +@@ -20,6 +20,9 @@ extern "C" { + #include "op_types.h" + #include "op_list.h" + ++#define UMASK_SHIFT 8 ++#define UMASK_MASK 0xff ++ + #define EXTRA_EDGE (1U << 18) + #define EXTRA_MIN_VAL EXTRA_EDGE + +-- +2.1.0 + diff --git a/SOURCES/oprofile-goldmont.patch b/SOURCES/oprofile-goldmont.patch new file mode 100644 index 0000000..d88e660 --- /dev/null +++ b/SOURCES/oprofile-goldmont.patch @@ -0,0 +1,584 @@ +commit 0ad5a9e6af86a88e1dd41180f45bc48b646eba6a +Author: Andi Kleen +Date: Tue Apr 26 07:52:51 2016 -0700 + + oprofile: Add support for Goldmont events + + Add support for the Intel Goldmont events. + + OFFCORE_RESPONSE.* is not supported. + + v2: Fix typos in descriptions. 
+ v3: Add inst_retired.any_pebs + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index 56f9020..677b05f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -20,6 +20,7 @@ event_files = \ + i386/broadwell/events i386/broadwell/unit_masks \ + i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ ++ i386/goldmont/events i386/goldmont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ + ia64/itanium/events ia64/itanium/unit_masks \ + +diff --git a/events/i386/goldmont/events b/events/i386/goldmont/events +new file mode 100644 +index 0000000..111438e +--- /dev/null ++++ b/events/i386/goldmont/events +@@ -0,0 +1,34 @@ ++# ++# Intel "Goldmont" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x03 counters:cpuid um:ld_blocks minimum:200003 name:ld_blocks : ++event:0x05 counters:cpuid um:page_walks minimum:200003 name:page_walks : ++event:0x0e counters:cpuid um:uops_issued minimum:200003 name:uops_issued_any : ++event:0x13 counters:cpuid um:misalign_mem_ref minimum:200003 name:misalign_mem_ref : ++event:0x2e counters:cpuid um:longest_lat_cache minimum:200003 name:longest_lat_cache : ++event:0x30 counters:cpuid um:l2_reject_xq minimum:200003 name:l2_reject_xq_all : ++event:0x31 counters:cpuid um:core_reject_l2q minimum:200003 name:core_reject_l2q_all : ++event:0x51 counters:cpuid um:dl1 minimum:200003 name:dl1_dirty_eviction : ++event:0x80 counters:cpuid um:icache minimum:200003 name:icache : ++event:0x81 counters:cpuid um:itlb minimum:200003 name:itlb_miss : ++event:0x86 counters:cpuid um:fetch_stall minimum:200003 name:fetch_stall_icache_fill_pending_cycles : ++event:0x9c counters:cpuid um:uops_not_delivered minimum:200003 name:uops_not_delivered_any : ++event:0xc0 counters:cpuid um:inst_retired minimum:2000003 name:inst_retired : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:200003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:200003 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:200003 name:br_misp_retired : ++event:0xca counters:cpuid um:issue_slots_not_consumed minimum:200003 name:issue_slots_not_consumed : ++event:0xcb counters:cpuid um:hw_interrupts minimum:200003 name:hw_interrupts : ++event:0xcd counters:cpuid um:cycles_div_busy minimum:2000003 name:cycles_div_busy : ++event:0xd0 counters:cpuid um:mem_uops_retired minimum:200003 name:mem_uops_retired : ++event:0xd1 counters:cpuid um:mem_load_uops_retired minimum:200003 name:mem_load_uops_retired : ++event:0xe6 counters:cpuid um:baclears 
minimum:200003 name:baclears : ++event:0xe7 counters:cpuid um:ms_decoded minimum:200003 name:ms_decoded_ms_entry : ++event:0xe9 counters:cpuid um:decode_restriction minimum:200003 name:decode_restriction_predecode_wrong : +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +new file mode 100644 +index 0000000..2f265b3 +--- /dev/null ++++ b/events/i386/goldmont/unit_masks +@@ -0,0 +1,155 @@ ++# ++# Unit masks for the Intel "Goldmont" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++name:core_reject_l2q type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and L1 prefetcher requests rejected by the L2Q due to a full or nearly full condition which likely indicates back pressure from L2Q. It also counts requests that would have gone directly to the XQ, but are rejected due to a full or nearly full condition, indicating back pressure from the IDI link. The L2Q may also reject transactions from a core to insure fairness between cores, or to delay a core's dirty eviction when the address conflicts with incoming external snoops. ++name:decode_restriction type:mandatory default:0x1 ++ 0x1 extra: predecode_wrong Counts the number of times the prediction (from the predecode cache) for instruction length is incorrect. ++name:dl1 type:mandatory default:0x1 ++ 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback. ++name:fetch_stall type:mandatory default:0x2 ++ 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cummulative count of cycles stalled for all icache misses. 
++name:itlb type:mandatory default:0x4 ++ 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translation are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. ++name:l2_reject_xq type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. ++name:ms_decoded type:mandatory default:0x1 ++ 0x1 extra: ms_entry Counts the number of times the Microcde Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++name:uops_issued type:mandatory default:0x0 ++ 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. 
The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. ++name:uops_not_delivered type:mandatory default:0x0 ++ 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++name:cpu_clk_unhalted type:exclusive default:core ++ 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. ++ 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event ++ 0x0 extra: core_p Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x1 extra: ref Reference cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. 
++name:ld_blocks type:exclusive default:all_block ++ 0x10 extra: all_block Counts anytime a load that retires is blocked for any reason. ++ 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. ++ 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. ++ 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. ++name:page_walks type:exclusive default:0x1 ++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side walks (due to data operation) page walk is in progress. ++ 0x2 extra: i_side_cycles Counts every core cycle when a Instruction-side (walks due to an instruction fetch) page walk is in progress. ++ 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. ++name:misalign_mem_ref type:exclusive default:load_page_split ++ 0x2 extra: load_page_split Counts when a memory load of a uop spans a page boundary (a split) is retired. ++ 0x2 extra:pebs load_page_split_pebs Counts when a memory load of a uop spans a page boundary (a split) is retired. 
++ 0x4 extra: store_page_split Counts when a memory store of a uop spans a page boundary (a split) is retired. ++ 0x4 extra:pebs store_page_split_pebs Counts when a memory store of a uop spans a page boundary (a split) is retired. ++name:longest_lat_cache type:exclusive default:0x4f ++ 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. ++ 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. ++name:icache type:exclusive default:0x1 ++ 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache ++ 0x2 extra: misses Counts each cache line access to the Icache that are not fullfilled (miss) by the Icache ++ 0x3 extra: accesses Counts each cache line access to the Icache ++name:inst_retired type:exclusive default:any ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. 
This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++name:uops_retired type:exclusive default:any ++ 0x0 extra: any Counts uops which retired ++ 0x0 extra:pebs any_pebs Counts uops which retired ++ 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x8 extra: fpdiv Counts the number of floating point divide uops retired. ++ 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. ++ 0x10 extra: idiv Counts the number of integer divide uops retired. ++ 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. ++name:machine_clears type:exclusive default:0x0 ++ 0x0 extra: all Counts machine clears for any reason ++ 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. 
For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. ++ 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. ++name:br_inst_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. ++ 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. ++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xf9 extra: call Counts near CALL branch instructions retired. ++ 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. ++ 0xfd extra: rel_call Counts near relative CALL branch instructions retired. ++ 0xfd extra:pebs rel_call_pebs Counts near relative CALL branch instructions retired. ++ 0xfb extra: ind_call Counts near indirect CALL branch instructions retired. 
++ 0xfb extra:pebs ind_call_pebs Counts near indirect CALL branch instructions retired. ++ 0xf7 extra: return Counts near return branch instructions retired. ++ 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. ++ 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++name:br_misp_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. ++ 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). 
++ 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. ++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++ 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++name:issue_slots_not_consumed type:exclusive default:0x0 ++ 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. 
Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. ++name:hw_interrupts type:exclusive default:0x1 ++ 0x1 extra: received Counts hardware interrupts received by the processor. ++ 0x4 extra: pending_and_masked Counts core cycles during which there are pending interrupts, but interrupts are masked (EFLAGS.IF = 0). ++name:cycles_div_busy type:exclusive default:0x0 ++ 0x0 extra: all Counts core cycles if either divide unit is busy. ++ 0x1 extra: idiv Counts core cycles the integer divide unit is busy. ++ 0x2 extra: fpdiv Counts core cycles the floating point divide unit is busy. ++name:mem_uops_retired type:exclusive default:all ++ 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x81 extra: all_loads Counts the number of load uops retired ++ 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. 
++ 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. ++ 0x12 extra: dtlb_miss_stores Counts store uops retired that caused a DTLB miss. ++ 0x12 extra:pebs dtlb_miss_stores_pebs Counts store uops retired that caused a DTLB miss. ++ 0x13 extra: dtlb_miss Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundry. 
++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundry. ++name:mem_load_uops_retired type:exclusive default:l1_hit ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache ++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. 
In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. 
++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++name:baclears type:exclusive default:0x1 ++ 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. ++ 0x8 extra: return Counts BACLEARS on return instructions. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index b1d5ecf..7bdde53 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -122,6 +122,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, ++ { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -739,6 +740,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_HASWELL: + case CPU_BROADWELL: + case CPU_SKYLAKE: ++ case CPU_GOLDMONT: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 9983f87..98289c5 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -102,6 +102,7 @@ typedef enum { + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ ++ CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 25f010e..cdd0409 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1212,6 +1212,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + descr->name = "CPU_CLK_UNHALTED"; + break; + ++ case CPU_GOLDMONT: + case CPU_SKYLAKE: + descr->name = "cpu_clk_unhalted"; + break; +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index a6180f4..f4db8f5 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -162,6 +162,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4d: + case 0x4c: + return CPU_SILVERMONT; ++ case 0x5c: ++ case 0x5f: ++ return CPU_GOLDMONT; + } + } + return cpu_type; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 
fdddddc..5821593 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -544,6 +544,7 @@ int main(int argc, char const * argv[]) + case CPU_BROADWELL: + case CPU_SKYLAKE: + case CPU_SILVERMONT: ++ case CPU_GOLDMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +commit 6f2758a46554f69403e2ebc1a3e4a58350682638 +Author: Andi Kleen +Date: Fri May 6 12:11:46 2016 -0700 + + oprofile: Update Goldmont events + + This patch adds some updates to the Goldmont events. Mainly it is editorial updates + to the event descriptions. In addition it also removes the events not listed + in the SDM (which were not intended to be included) + + v2: Minor edits + Signed-off-by: Andi Kleen + +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +index 2f265b3..d1c08d4 100644 +--- a/events/i386/goldmont/unit_masks ++++ b/events/i386/goldmont/unit_masks +@@ -10,17 +10,17 @@ name:decode_restriction type:mandatory default:0x1 + name:dl1 type:mandatory default:0x1 + 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback. + name:fetch_stall type:mandatory default:0x2 +- 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cummulative count of cycles stalled for all icache misses. ++ 0x2 extra: icache_fill_pending_cycles Counts cycles that an ICache miss is outstanding, and instruction fetch is stalled. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes, while an Icache miss outstanding. Note this event is not the same as cycles to retrieve an instruction due to an Icache miss. Rather, it is the part of the Instruction Cache (ICache) miss time where no bytes are available for the decoder. 
+ name:itlb type:mandatory default:0x4 + 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translation are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. + name:l2_reject_xq type:mandatory default:0x0 + 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. + name:ms_decoded type:mandatory default:0x1 +- 0x1 extra: ms_entry Counts the number of times the Microcde Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++ 0x1 extra: ms_entry Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. 
+ name:uops_issued type:mandatory default:0x0 + 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. + name:uops_not_delivered type:mandatory default:0x0 +- 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++ 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by back end. The back-end then takes these uops, allocates the required resources. When all resources are ready, uops are executed. If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. 
However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when back-end is requesting more uops and front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end efficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue. + name:cpu_clk_unhalted type:exclusive default:core + 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. + 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. 
This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event +@@ -31,12 +31,14 @@ name:ld_blocks type:exclusive default:all_block + 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. + 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). + 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x2 extra: store_forward Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. ++ 0x2 extra:pebs store_forward_pebs Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. + 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. + 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. + name:page_walks type:exclusive default:0x1 +- 0x1 extra: d_side_cycles Counts every core cycle when a Data-side walks (due to data operation) page walk is in progress. ++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side (walks due to a data operation) page walk is in progress. 
+ 0x2 extra: i_side_cycles Counts every core cycle when a Instruction-side (walks due to an instruction fetch) page walk is in progress. + 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. + name:misalign_mem_ref type:exclusive default:load_page_split +@@ -48,35 +50,31 @@ name:longest_lat_cache type:exclusive default:0x4f + 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. + 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. + name:icache type:exclusive default:0x1 +- 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache +- 0x2 extra: misses Counts each cache line access to the Icache that are not fullfilled (miss) by the Icache +- 0x3 extra: accesses Counts each cache line access to the Icache ++ 0x1 extra: hit Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is in the ICache (hit). The event strives to count on a cache line basis, so that multiple accesses which hit in a single cache line count as one ICACHE.HIT. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x2 extra: misses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is not in the ICache (miss). The event strives to count on a cache line basis, so that multiple accesses which miss in a single cache line count as one ICACHE.MISS. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is not in the ICache. 
This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x3 extra: accesses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line. The event strives to count on a cache line basis, so that multiple fetches to a single cache line count as one ICACHE.ACCESS. Specifically, the event counts when accesses from straight line code crosses the cache line boundary, or when a branch target is to a new line. This event counts differently than Intel processors based on Silvermont microarchitecture. + name:inst_retired type:exclusive default:any +- 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event +- 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. +- 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. 
Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event. ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra:pebs any_p_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. 
+ name:uops_retired type:exclusive default:any + 0x0 extra: any Counts uops which retired + 0x0 extra:pebs any_pebs Counts uops which retired + 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. + 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. +- 0x8 extra: fpdiv Counts the number of floating point divide uops retired. +- 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. +- 0x10 extra: idiv Counts the number of integer divide uops retired. +- 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. + name:machine_clears type:exclusive default:0x0 + 0x0 extra: all Counts machine clears for any reason + 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. +- 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved - as another core is in the process of modifying the data. + 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. 
For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. + 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. + name:br_inst_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. + 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. +- 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. +- 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. 
++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. + 0xf9 extra: call Counts near CALL branch instructions retired. + 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. + 0xfd extra: rel_call Counts near relative CALL branch instructions retired. +@@ -87,24 +85,24 @@ name:br_inst_retired type:exclusive default:all_branches + 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. + 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. + 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. +- 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). +- 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. + name:br_misp_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. 
+ 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. +- 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). +- 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). + 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. + 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. +- 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. 
+- 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. +- 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. +- 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. + 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. + 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. + name:issue_slots_not_consumed type:exclusive default:0x0 + 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) +- 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. 
Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. Including but not limited to resources such as the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. + 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. + name:hw_interrupts type:exclusive default:0x1 + 0x1 extra: received Counts hardware interrupts received by the processor. +@@ -117,8 +115,8 @@ name:mem_uops_retired type:exclusive default:all + 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. + 0x81 extra: all_loads Counts the number of load uops retired + 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired +- 0x82 extra: all_stores Counts the number of store uops retired +- 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired. ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired. 
+ 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. + 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. + 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. +@@ -128,28 +126,28 @@ name:mem_uops_retired type:exclusive default:all + 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. + 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. + 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. +- 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundry. 
++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundary. + name:mem_load_uops_retired type:exclusive default:l1_hit +- 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache +- 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache +- 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache +- 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache +- 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache +- 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache +- 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache +- 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache. ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache. ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache. ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache. ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache. ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache. ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache. 
++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache. + 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. + 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. +- 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. 
Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. +- 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. 
A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. 
A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. + name:baclears type:exclusive default:0x1 + 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. + 0x8 extra: return Counts BACLEARS on return instructions. +- 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Condition is Met) branches. +From b3c20ae8b52c10aa631ca0b931388df98ca3183d Mon Sep 17 00:00:00 2001 +From: Michael Petlan +Date: Fri, 23 Sep 2016 13:35:54 +0200 +Subject: [PATCH] Intel Goldmont default event + +Hi all, + +when testing oprofile on an Intel Goldmont machine, I have found out +that the default event cpu_clk_unhalted returns always zero. Thus, I +checked the configuration and Intel SDM, and I think there must be a +mistake. + +According to the Intel SDM, table 19-24, the event is 0x3c as usual. +It has two unit masks (0x00 (core_p) and 0x01 (ref)). With this, the +event starts giving reasonable results. + +The current configuration which is coded in oprofile is not even in +the SDM table 19-24, so it is to be expected that the following will give +zero value: + +perf stat -e cpu/event=0x00,umask=0x02/ ls + +Please consider applying the attached patch. + +CC'ing Andi to verify the fix.
+ +Thank you, +Michael + +commit df73e385442236fd6e763cc192185c606e59feda +Author: Michael Petlan +Date: Fri Sep 23 13:16:00 2016 +0200 + + Fixed default event on Intel Goldmont + + According to the Intel SDM, table 19-24, the event cpu_clk_unhalted + has the event number 0x3c and has two unit masks (0x00, 0x01). This + also corresponds to other Intels where the event is also 0x3c. + + Tested on a Goldmont Harrisonville (model 95). + + Before the patch: + + $ ocount ls + Events were actively counted for 1761229 nanoseconds. + Event counts (actual) for /usr/bin/ls: + Event Count % time counted + cpu_clk_unhalted 0 100.00 + + After the patch: + + Event counts (actual) for /usr/bin/ls: + Event Count % time counted + cpu_clk_unhalted 2,948,142 100.00 + + Signed-off-by: Michael Petlan +--- + events/i386/goldmont/events | 2 +- + events/i386/goldmont/unit_masks | 4 +--- + 2 files changed, 2 insertions(+), 4 deletions(-) + +diff --git a/events/i386/goldmont/events b/events/i386/goldmont/events +index 111438e..89cbc59 100644 +--- a/events/i386/goldmont/events ++++ b/events/i386/goldmont/events +@@ -6,7 +6,7 @@ + # Note the minimum counts are not discovered experimentally and could be likely + # lowered in many cases without ill effect. 
+ # +-event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x3c counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : + event:0x03 counters:cpuid um:ld_blocks minimum:200003 name:ld_blocks : + event:0x05 counters:cpuid um:page_walks minimum:200003 name:page_walks : + event:0x0e counters:cpuid um:uops_issued minimum:200003 name:uops_issued_any : +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +index d1c08d4..9d93da0 100644 +--- a/events/i386/goldmont/unit_masks ++++ b/events/i386/goldmont/unit_masks +@@ -21,9 +21,7 @@ name:uops_issued type:mandatory default:0x0 + 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. + name:uops_not_delivered type:mandatory default:0x0 + 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by back end. The back-end then takes these uops, allocates the required resources. 
When all resources are ready, uops are executed. If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when back-end is requesting more uops and front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end efficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue. +-name:cpu_clk_unhalted type:exclusive default:core +- 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. 
+- 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event ++name:cpu_clk_unhalted type:exclusive default:core_p + 0x0 extra: core_p Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. + 0x1 extra: ref Reference cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. + name:ld_blocks type:exclusive default:all_block +-- +2.7.4 + diff --git a/SOURCES/oprofile-haswell.patch b/SOURCES/oprofile-haswell.patch new file mode 100644 index 0000000..6c02359 --- /dev/null +++ b/SOURCES/oprofile-haswell.patch @@ -0,0 +1,570 @@ +commit 5f11ddb982931f754d3319a64313cf880424ea73 +Author: Andi Kleen +Date: Thu Jul 17 16:23:38 2014 -0500 + + Update the Haswell events to the latest version + + Some minor changes to the previous version, but it should be more + consistent with other tools now. + + The event name descriptions have been dropped. They were never all that + useful anyways because the event is defined by the unit masks. + Now all events with more than one unit mask only have a description + in the unit masks. + + As a new feature any known Errata to the event are referenced. + + Signed-off-by: Andi Kleen + +diff --git a/events/i386/haswell/events b/events/i386/haswell/events +index 51fcd50..5aa5eb5 100644 +--- a/events/i386/haswell/events ++++ b/events/i386/haswell/events +@@ -7,54 +7,58 @@ + # lowered in many cases without ill effect. 
+ # + include:i386/arch_perfmon +-event:0x03 counters:cpuid um:x02 minimum:100003 name:ld_blocks_store_forward : Cases when loads get true Block-on-Store blocking code preventing store forwarding +-event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : misalign_mem_ref +-event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : False dependencies in MOB due to partial address comparison +-event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : dtlb_load_misses +-event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : Number of cycles waiting for the checkpoints in Resource Allocation Table (RAT) to be recovered after Nuke due to all other cases except JEClear (e.g. whenever a ucode assist is needed like SSE exception, memory disambiguation, etc...) +-event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : uops_issued +-event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : l2_rqsts +-event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : Not rejected writebacks that hit L2 cache +-event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : l1d_pend_miss +-event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : dtlb_store_misses +-event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre : load_hit_pre +-event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : L1D data line replacements +-event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : tx_mem +-event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : move_elimination +-event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : cpl_cycles +-event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : tx_exec +-event:0x5e counters:cpuid um:one minimum:2000003 name:rs_events_empty_cycles : Cycles when 
Reservation Station (RS) is empty for the thread +-event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : lock_cycles +-event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : idq +-event:0x80 counters:cpuid um:x02 minimum:200003 name:icache_misses : Number of Instruction Cache, Streaming Buffer and Victim Cache Misses. Includes Uncacheable accesses. +-event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : itlb_misses +-event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall : ild_stall +-event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : br_inst_exec +-event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : br_misp_exec +-event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : idq_uops_not_delivered +-event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : uops_executed_port +-event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : resource_stalls +-event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : cycle_activity +-event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages. 
+-event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : offcore_requests +-event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : uops_executed +-event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : page_walker_loads +-event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : tlb_flush +-event:0xc0 counters:1 um:one minimum:2000003 name:inst_retired_prec_dist : Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution +-event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : other_assists +-event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : uops_retired +-event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears : machine_clears +-event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : br_inst_retired +-event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : br_misp_retired +-event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : hle_retired +-event:0xc9 counters:cpuid um:rtm_retired minimum:2000003 name:rtm_retired : rtm_retired +-event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : fp_assist +-event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : Count cases of saving new LBR +-event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : mem_uops_retired +-event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : mem_load_uops_retired +-event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : mem_load_uops_l3_hit_retired +-event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : Data from local DRAM either Snoop not needed or Snoop Miss (RspI) +-event:0xe6 counters:cpuid 
um:x1f minimum:100003 name:baclears_any : Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end. +-event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : l2_trans +-event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : l2_lines_in +-event:0xf2 counters:cpuid um:l2_lines_out minimum:100003 name:l2_lines_out : l2_lines_out ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : ++event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : ++event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre : ++event:0x4f counters:cpuid um:x10 minimum:2000003 name:ept_walk_cycles : ++event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : ++event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 
name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : ++event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:icache minimum:2000003 name:icache : ++event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall : ++event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : ++event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : ++event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa8 counters:cpuid um:one minimum:2000003 name:lsd_uops : ++event:0xab counters:cpuid um:x02 minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : ++event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : ++event:0xc0 counters:1 um:one minimum:2000003 name:inst_retired_prec_dist : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:2000003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc8 counters:cpuid 
um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:0,1,2,3 um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : ++event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : ++event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : ++event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : ++event:0xd3 counters:0,1,2,3 um:mem_load_uops_l3_miss_retired minimum:100007 name:mem_load_uops_l3_miss_retired : ++event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : ++event:0xf2 counters:cpuid um:l2_lines_out minimum:100003 name:l2_lines_out : +diff --git a/events/i386/haswell/unit_masks b/events/i386/haswell/unit_masks +index 32e1c1e..60c2a61 100644 +--- a/events/i386/haswell/unit_masks ++++ b/events/i386/haswell/unit_masks +@@ -8,27 +8,32 @@ name:x02 type:mandatory default:0x2 + 0x2 No unit mask + name:x03 type:mandatory default:0x3 + 0x3 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask + name:x1f type:mandatory default:0x1f + 0x1f No unit mask + name:x20 type:mandatory default:0x20 + 0x20 No unit mask + name:x50 type:mandatory default:0x50 + 0x50 No unit mask ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load. The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceding smaller uncompleted store. 
The penalty for blocked store forwarding is that the load must wait for the store to write its value to the cache before it can be issued. ++ 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use + name:misalign_mem_ref type:exclusive default:0x1 + 0x1 extra: loads Speculative cache line split load uops dispatched to L1 cache + 0x2 extra: stores Speculative cache line split STA uops dispatched to L1 cache + name:dtlb_load_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Load misses in all DTLB levels that cause page walks +- 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x2 extra: walk_completed_4k Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (4K). + 0x4 extra: walk_completed_2m_4m Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (2M/4M). +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +- 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) +- 0x40 extra: stlb_hit_2m Load misses that miss the DTLB and hit the STLB (2M) ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by DTLB load misses. ++ 0x20 extra: stlb_hit_4k This event counts load operations from a 4K page that miss the first DTLB level but hit the second and do not cause page walks. ++ 0x40 extra: stlb_hit_2m This event counts load operations from a 2M page that miss the first DTLB level but hit the second and do not cause page walks. 
+ 0x80 extra: pde_cache_miss DTLB demand load misses with low part of linear-to-physical address translation missed +-name:uops_issued type:exclusive default:any +- 0x1 extra: any Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS) ++ 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. ++ 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks ++name:uops_issued type:exclusive default:0x1 ++ 0x1 extra: any This event counts the number of uops issued by the Front-end of the pipeline to the Back-end. This event is counted at the allocation stage and will count both retired and non-retired uops. + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. + 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated +@@ -47,49 +52,59 @@ name:l2_rqsts type:exclusive default:0x21 + 0x22 extra: rfo_miss RFO requests that miss L2 cache + 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. 
+ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions +- 0x27 extra: all_demand_miss Demand requests that miss L2 cache +- 0xe7 extra: all_demand_references Demand requests to L2 cache +- 0x3f extra: miss All requests that miss L2 cache +- 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:pending ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:l1d_pend_miss type:exclusive default:0x1 + 0x1 extra: pending L1D miss oustandings duration in cycles + 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. +- 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding, using an edge detect to count transitions. + name:dtlb_store_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Store misses in all DTLB levels that cause page walks +- 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks +- 0x2 extra: walk_completed_4k Store miss in all TLB levels causes a page walk that completes. (4K) ++ 0x2 extra: walk_completed_4k Store miss in all TLB levels causes a page walk that completes. (4K) + 0x4 extra: walk_completed_2m_4m Store misses in all DTLB levels that cause completed page walks (2M/4M) +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks +- 0x20 extra: stlb_hit_4k Store misses that miss the DTLB and hit the STLB (4K) +- 0x40 extra: stlb_hit_2m Store misses that miss the DTLB and hit the STLB (2M) ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by DTLB store misses. 
++ 0x20 extra: stlb_hit_4k This event counts store operations from a 4K page that miss the first DTLB level but hit the second and do not cause page walks. ++ 0x40 extra: stlb_hit_2m This event counts store operations from a 2M page that miss the first DTLB level but hit the second and do not cause page walks. + 0x80 extra: pde_cache_miss DTLB store misses with low part of linear-to-physical address translation missed ++ 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks + name:load_hit_pre type:exclusive default:0x1 + 0x1 extra: sw_pf Not software-prefetch load dispatches that hit FB allocated for software prefetch + 0x2 extra: hw_pf Not software-prefetch load dispatches that hit FB allocated for hardware prefetch + name:tx_mem type:exclusive default:0x1 + 0x1 extra: abort_conflict Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address +- 0x2 extra: abort_capacity Number of times a transactional abort was signaled due to a data capacity limitation ++ 0x2 extra: abort_capacity_write Number of times a transactional abort was signaled due to a data capacity limitation for transactional writes. + 0x4 extra: abort_hle_store_to_elided_lock Number of times a HLE transactional region aborted due to a non XRELEASE prefixed instruction writing to an elided lock in the elision buffer + 0x8 extra: abort_hle_elision_buffer_not_empty Number of times an HLE transactional execution aborted due to NoAllocatedElisionBuffer being non-zero. +- 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer. 
++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer + 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times an HLE transactional execution aborted due to an unsupported read alignment from the elision buffer. +- 0x40 extra: abort_hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. ++ 0x40 extra: hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. + name:move_elimination type:exclusive default:0x1 + 0x1 extra: int_eliminated Number of integer Move Elimination candidate uops that were eliminated. + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. +-name:cpl_cycles type:exclusive default:ring0 ++name:cpl_cycles type:exclusive default:0x1 + 0x1 extra: ring0 Unhalted core cycles when the thread is in ring 0 + 0x2 extra: ring123 Unhalted core cycles when thread is in rings 1, 2, or 3 + 0x1 extra:cmask=1,edge ring0_trans Number of intervals between processor halts while thread is in ring 0 + name:tx_exec type:exclusive default:0x1 +- 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution it may not always cause a transactional abort. 
+- 0x2 extra: misc2 Counts the number of times a class of instructions that may cause a transactional abort was executed inside a transactional region +- 0x4 extra: misc3 Counts the number of times an instruction execution caused the nest count supported to be exceeded +- 0x8 extra: misc4 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++ 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort. ++ 0x2 extra: misc2 Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region ++ 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded ++ 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. ++ 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++name:rs_events type:exclusive default:0x1 ++ 0x1 extra: empty_cycles This event counts cycles when the Reservation Station ( RS ) is empty for the thread. The RS is a structure that buffers allocated micro-ops from the Front-end. If there are many cycles when the RS is empty, it may represent an underflow of instructions delivered from the Front-end. ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. 
++ 0x2 extra: demand_code_rd Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x4 extra: demand_rfo Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore ++ 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x8 extra:cmask=1 cycles_with_data_rd Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore + name:lock_cycles type:exclusive default:0x1 + 0x1 extra: split_lock_uc_lock_duration Cycles when L1 and L2 are locked due to UC or split lock + 0x2 extra: cache_lock_duration Cycles when L1D is locked +@@ -99,8 +114,8 @@ name:idq type:exclusive default:0x2 + 0x8 extra: dsb_uops Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path + 0x10 extra: ms_dsb_uops Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy + 0x20 extra: ms_mite_uops Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +- 0x30 extra: ms_uops Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +- 0x30 extra:cmask=1 ms_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++ 0x30 extra: ms_uops This event counts uops delivered by the Front-end with the assistance of the microcode sequencer. Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder. Using other instructions, if possible, will usually improve performance. 
++ 0x30 extra:cmask=1 ms_cycles This event counts cycles during which the microcode sequencer assisted the Front-end in delivering uops. Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder. Using other instructions, if possible, will usually improve performance. + 0x4 extra:cmask=1 mite_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path + 0x8 extra:cmask=1 dsb_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path + 0x10 extra:cmask=1 ms_dsb_cycles Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +@@ -110,17 +125,21 @@ name:idq type:exclusive default:0x2 + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering any Uop + 0x3c extra: mite_all_uops Uops delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++name:icache type:exclusive default:0x2 ++ 0x2 extra: misses This event counts Instruction Cache (ICACHE) misses. ++ 0x4 extra: ifetch_stall Cycles where a code-fetch stalled due to L1 instruction-cache miss or an iTLB miss + name:itlb_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Misses at all ITLB levels that cause page walks +- 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks + 0x2 extra: walk_completed_4k Code miss in all TLB levels causes a page walk that completes. (4K) + 0x4 extra: walk_completed_2m_4m Code miss in all TLB levels causes a page walk that completes. 
(2M/4M) +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by ITLB misses. + 0x20 extra: stlb_hit_4k Core misses that miss the DTLB and hit the STLB (4K) + 0x40 extra: stlb_hit_2m Code misses that miss the DTLB and hit the STLB (2M) ++ 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks + name:ild_stall type:exclusive default:0x1 +- 0x1 extra: lcp Stalls caused by changing prefix length of the instruction. ++ 0x1 extra: lcp This event counts cycles where the decoder is stalled on an instruction with a length changing prefix (LCP). + 0x4 extra: iq_full Stall cycles because IQ is full + name:br_inst_exec type:exclusive default:0xff + 0xff extra: all_branches Speculative and retired branches +@@ -145,14 +164,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional Speculative and retired mispredicted macro conditional branches + 0xc4 extra: all_indirect_jump_non_call_ret Mispredicted indirect branches excluding calls and returns + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:core +- 0x1 extra: core Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled +- 0x1 extra:cmask=4 cycles_0_uops_deliv_core Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled +- 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is 
not stalled ++name:idq_uops_not_delivered type:exclusive default:0x1 ++ 0x1 extra: core This event counts the number of undelivered (unallocated) uops from the Front-end to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. The Front-end can allocate up to 4 uops per cycle so this event can increment 0-4 times per cycle depending on the number of unallocated uops. This event is counted on a per-core basis. ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts the number of cycles during which the Front-end allocated exactly zero uops to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. This event is counted on a per-core basis. ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +-name:uops_executed_port type:exclusive default:port_0 ++name:uops_executed_port type:exclusive default:0x1 + 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 + 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 + 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 +@@ -172,88 +191,100 @@ name:uops_executed_port type:exclusive default:port_0 + name:resource_stalls type:exclusive default:0x1 + 0x1 extra: any Resource-related stall cycles + 0x4 extra: rs Cycles stalled due to no eligible RS entry available. +- 0x8 extra: sb Cycles stalled due to no store buffers available. (not including draining form sync).
++ 0x8 extra: sb This event counts cycles during which no instructions were allocated because no Store Buffers (SB) were available. + 0x10 extra: rob Cycles stalled due to re-order buffer full. +-name:cycle_activity type:exclusive default:0x8 ++name:cycle_activity type:exclusive default:0x1 ++ 0x1 extra:cmask=1 cycles_l2_pending Cycles with pending L2 cache miss loads. + 0x8 extra:cmask=8 cycles_l1d_pending Cycles with pending L1 cache miss loads. + 0x2 extra:cmask=2 cycles_ldm_pending Cycles with pending memory loads. +- 0x4 extra:cmask=4 cycles_no_execute Total execution stalls +- 0x6 extra:cmask=6 stalls_ldm_pending Execution stalls due to memory subsystem. +-name:offcore_requests type:exclusive default:0x2 ++ 0x4 extra:cmask=4 cycles_no_execute This event counts cycles during which no instructions were executed in the execution stage of the pipeline. ++ 0x5 extra:cmask=5 stalls_l2_pending Execution stalls due to L2 cache misses. ++ 0x6 extra:cmask=6 stalls_ldm_pending This event counts cycles during which no instructions were executed in the execution stage of the pipeline and there were memory instructions pending (waiting for data). ++ 0xc extra:cmask=c stalls_l1d_pending Execution stalls due to L1 data cache misses ++name:offcore_requests type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd Demand Data Read requests sent to uncore + 0x2 extra: demand_code_rd Cacheable and noncachaeble code read requests + 0x4 extra: demand_rfo Demand RFO requests including regular RFOs, locks, ItoM + 0x8 extra: all_data_rd Demand and prefetch data reads +-name:uops_executed type:exclusive default:thread +- 0x1 extra: thread Counts the number of uops to be executed per-thread each cycle. +- 0x2 extra: core Number of uops executed on the core. ++name:uops_executed type:exclusive default:0x2 ++ 0x2 extra: core Number of uops executed on the core. Errata: HSM31 + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread. 
+- 0x1 extra:cmask=1,inv cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x1 extra:cmask=1 cycles_ge_1_uops_exec This event counts the cycles where at least one uop was executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec This event counts the cycles where at least two uops were executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec This event counts the cycles where at least three uops were executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread Errata: HSM31 + name:page_walker_loads type:exclusive default:0x11 +- 0x11 extra: ia32_dtlb_l1 Number of DTLB page walker hits in the L1+FB +- 0x21 extra: ia32_itlb_l1 Number of ITLB page walker hits in the L1+FB +- 0x12 extra: ia32_dtlb_l2 Number of DTLB page walker hits in the L2 +- 0x22 extra: ia32_itlb_l2 Number of ITLB page walker hits in the L2 +- 0x14 extra: ia32_dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP +- 0x24 extra: ia32_itlb_l3 Number of ITLB page walker hits in the L3 + XSNP +- 0x18 extra: ia32_dtlb_memory Number of DTLB page walker hits in Memory +- 0x28 extra: ia32_itlb_memory Number of ITLB page walker hits in Memory ++ 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB ++ 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB ++ 0x41 extra: ept_dtlb_l1 Counts the number of Extended Page Table walks from the DTLB that hit in the L1 and FB. ++ 0x81 extra: ept_itlb_l1 Counts the number of Extended Page Table walks from the ITLB that hit in the L1 and FB.
++ 0x12 extra: dtlb_l2 Number of DTLB page walker hits in the L2 ++ 0x22 extra: itlb_l2 Number of ITLB page walker hits in the L2 ++ 0x42 extra: ept_dtlb_l2 Counts the number of Extended Page Table walks from the DTLB that hit in the L2. ++ 0x82 extra: ept_itlb_l2 Counts the number of Extended Page Table walks from the ITLB that hit in the L2. ++ 0x14 extra: dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP ++ 0x24 extra: itlb_l3 Number of ITLB page walker hits in the L3 + XSNP ++ 0x44 extra: ept_dtlb_l3 Counts the number of Extended Page Table walks from the DTLB that hit in the L3. ++ 0x84 extra: ept_itlb_l3 Counts the number of Extended Page Table walks from the ITLB that hit in the L3. ++ 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory ++ 0x48 extra: ept_dtlb_memory Counts the number of Extended Page Table walks from the DTLB that hit in memory. ++ 0x88 extra: ept_itlb_memory Counts the number of Extended Page Table walks from the ITLB that hit in memory. + name:tlb_flush type:exclusive default:0x1 + 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries + 0x20 extra: stlb_any STLB flush attempts + name:other_assists type:exclusive default:0x8 +- 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. +- 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. ++ 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. Errata: HSM57 ++ 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. Errata: HSM57 + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:all +- 0x1 extra: all Actually retired uops. +- 0x2 extra: retire_slots Retirement slots used. +- 0x1 extra:pebs all_ps Actually retired uops. (Precise Event - PEBS) +- 0x2 extra:pebs retire_slots_ps Retirement slots used.
(Precise Event - PEBS) +- 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. +- 0x1 extra:cmask=10,inv total_cycles Cycles with less than 10 actually retired uops. +- 0x1 extra:cmask=1,inv,any core_stall_cycles Cycles without actually retired uops. +-name:machine_clears type:exclusive default:0x2 +- 0x2 extra: memory_ordering Counts the number of machine clears due to memory order conflicts. +- 0x4 extra: smc Self-modifying code (SMC) detected. +- 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. +-name:br_inst_retired type:exclusive default:all_branches_ps +- 0x1 extra: conditional Conditional branch instructions retired. +- 0x2 extra: near_call Direct and indirect near call instructions retired. +- 0x8 extra: near_return Return instructions retired. +- 0x10 extra: not_taken Not taken branch instructions retired. +- 0x20 extra: near_taken Taken branch instructions retired. +- 0x40 extra: far_branch Far branch instructions retired. +- 0x1 extra:pebs conditional_ps Conditional branch instructions retired. (Precise Event - PEBS) +- 0x2 extra:pebs near_call_ps Direct and indirect near call instructions retired. (Precise Event - PEBS) +- 0x4 extra:pebs all_branches_ps All (macro) branch instructions retired. (Precise Event - PEBS) +- 0x8 extra:pebs near_return_ps Return instructions retired. (Precise Event - PEBS) +- 0x20 extra:pebs near_taken_ps Taken branch instructions retired. (Precise Event - PEBS) +- 0x2 extra: near_call_r3 Direct and indirect macro near call instructions retired (captured in ring 3). +- 0x2 extra:pebs near_call_r3_ps Direct and indirect macro near call instructions retired (captured in ring 3). (Precise Event - PEBS) +-name:br_misp_retired type:exclusive default:all_branches_ps +- 0x1 extra: conditional Mispredicted conditional branch instructions retired. 
+- 0x1 extra:pebs conditional_ps Mispredicted conditional branch instructions retired. (Precise Event - PEBS) +- 0x4 extra:pebs all_branches_ps Mispredicted macro branch instructions retired. (Precise Event - PEBS) +- 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. +- 0x20 extra:pebs near_taken_ps number of near branch instructions retired that were mispredicted and taken. (Precise Event - PEBS) ++name:uops_retired type:exclusive default:0x1 ++ 0x1 extra: all Actually retired uops. ++ 0x1 extra: all_pebs Actually retired uops. ++ 0x2 extra: retire_slots This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. ++ 0x2 extra: retire_slots_pebs This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. ++ 0x1 extra:cmask=1,inv core_stall_cycles Cycles without actually retired uops. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra: cycles Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes. ++ 0x2 extra: memory_ordering This event counts the number of memory ordering machine clears detected. Memory ordering machine clears can result from memory address aliasing or snoops from another hardware thread or core to data inflight in the pipeline. Machine clears can have a significant performance impact if they are happening frequently. ++ 0x4 extra: smc This event is incremented when self-modifying code (SMC) is detected, which causes a machine clear. Machine clears can have a significant performance impact if they are happening frequently. 
++ 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: conditional Conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Conditional branch instructions retired. ++ 0x2 extra: near_call Direct and indirect near call instructions retired. ++ 0x2 extra: near_call_pebs Direct and indirect near call instructions retired. ++ 0x8 extra: near_return Return instructions retired. ++ 0x8 extra: near_return_pebs Return instructions retired. ++ 0x10 extra: not_taken Not taken branch instructions retired. ++ 0x20 extra: near_taken Taken branch instructions retired. ++ 0x20 extra: near_taken_pebs Taken branch instructions retired. ++ 0x40 extra: far_branch Far branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: conditional Mispredicted conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Mispredicted conditional branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs This event counts all mispredicted branch instructions retired. This is a precise event. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra: near_taken_pebs number of near branch instructions retired that were mispredicted and taken. + name:hle_retired type:exclusive default:0x1 + 0x1 extra: start Number of times an HLE execution started. 
+ 0x2 extra: commit Number of times an HLE execution successfully committed +- 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one) +- 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to 1 various memory events ++ 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ++ 0x4 extra: aborted_pebs Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ++ 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts). + 0x10 extra: aborted_misc2 Number of times an HLE execution aborted due to uncommon conditions + 0x20 extra: aborted_misc3 Number of times an HLE execution aborted due to HLE-unfriendly instructions + 0x40 extra: aborted_misc4 Number of times an HLE execution aborted due to incompatible memory type +- 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to none of the previous categories (e.g. interrupt) ++ 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to none of the previous 4 categories (e.g. interrupts) + name:rtm_retired type:exclusive default:0x1 + 0x1 extra: start Number of times an RTM execution started. + 0x2 extra: commit Number of times an RTM execution successfully committed +- 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one) +- 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events +- 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to uncommon conditions ++ 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). ++ 0x4 extra: aborted_pebs Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). 
++ 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts) ++ 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to various memory events (e.g., read/write capacity and conflicts). + 0x20 extra: aborted_misc3 Number of times an RTM execution aborted due to HLE-unfriendly instructions + 0x40 extra: aborted_misc4 Number of times an RTM execution aborted due to incompatible memory type + 0x80 extra: aborted_misc5 Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt) +@@ -263,51 +294,59 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input Number of X87 assists due to input value. + 0x8 extra: simd_output Number of SIMD FP assists due to Output values + 0x10 extra: simd_input Number of SIMD FP assists due to input values +-name:mem_uops_retired type:exclusive default:all_loads +- 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. +- 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. +- 0x21 extra: lock_loads Load uops with locked access retired to architected path. +- 0x41 extra: split_loads Line-splitted load uops retired to architected path. +- 0x42 extra: split_stores Line-splitted store uops retired to architected path. +- 0x81 extra: all_loads Load uops retired to architected path with filter on bits 0 and 1 applied. +- 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. +- 0x11 extra:pebs stlb_miss_loads_ps Load uops with true STLB miss retired to architected path. (Precise Event - PEBS) +- 0x12 extra:pebs stlb_miss_stores_ps Store uops true STLB miss retired to architected path. (Precise Event - PEBS) +- 0x21 extra:pebs lock_loads_ps Load uops with locked access retired to architected path. 
(Precise Event - PEBS) +- 0x41 extra:pebs split_loads_ps Line-splitted load uops retired to architected path. (Precise Event - PEBS) +- 0x42 extra:pebs split_stores_ps Line-splitted store uops retired to architected path. (Precise Event - PEBS) +- 0x81 extra:pebs all_loads_ps Load uops retired to architected path with filter on bits 0 and 1 applied. (Precise Event - PEBS) +- 0x82 extra:pebs all_stores_ps Store uops retired to architected path with filter on bits 0 and 1 applied. (Precise Event - PEBS) +-name:mem_load_uops_retired type:exclusive default:l1_hit +- 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. +- 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. +- 0x4 extra: l3_hit Retired load uops which data sources were data hits in LLC without snoops required. +- 0x10 extra: l2_miss Miss in mid-level (L2) cache. Excludes Unknown data-source. +- 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. +- 0x1 extra:pebs l1_hit_ps Retired load uops with L1 cache hits as data sources. (Precise Event - PEBS) +- 0x2 extra:pebs l2_hit_ps Retired load uops with L2 cache hits as data sources. (Precise Event - PEBS) +- 0x4 extra:pebs l3_hit_ps Miss in last-level (L3) cache. Excludes Unknown data-source. (Precise Event - PEBS) +- 0x40 extra:pebs hit_lfb_ps Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. (Precise Event - PEBS) +-name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss +- 0x1 extra: xsnp_miss Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache. +- 0x2 extra: xsnp_hit Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache. +- 0x4 extra: xsnp_hitm Retired load uops which data sources were HitM responses from shared LLC. 
+- 0x8 extra: xsnp_none Retired load uops which data sources were hits in LLC without snoops required. +- 0x1 extra:pebs xsnp_miss_ps Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache. (Precise Event - PEBS) +- 0x2 extra:pebs xsnp_hit_ps Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache. (Precise Event - PEBS) +- 0x4 extra:pebs xsnp_hitm_ps Retired load uops which data sources were HitM responses from shared LLC. (Precise Event - PEBS) +- 0x8 extra:pebs xsnp_none_ps Retired load uops which data sources were hits in LLC without snoops required. (Precise Event - PEBS) ++name:mem_uops_retired type:exclusive default:0x11 ++ 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x11 extra: stlb_miss_loads_pebs Load uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x12 extra: stlb_miss_stores_pebs Store uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x21 extra: lock_loads Load uops with locked access retired to architected path. Errata: HSM30 ++ 0x21 extra: lock_loads_pebs Load uops with locked access retired to architected path. Errata: HSM30 ++ 0x41 extra: split_loads Line-splitted load uops retired to architected path. Errata: HSM30 ++ 0x41 extra: split_loads_pebs Line-splitted load uops retired to architected path. Errata: HSM30 ++ 0x42 extra: split_stores Line-splitted store uops retired to architected path. Errata: HSM30 ++ 0x42 extra: split_stores_pebs Line-splitted store uops retired to architected path. Errata: HSM30 ++ 0x81 extra: all_loads Load uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++ 0x81 extra: all_loads_pebs Load uops retired to architected path with filter on bits 0 and 1 applied. 
Errata: HSM30 ++ 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++ 0x82 extra: all_stores_pebs Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++name:mem_load_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. Errata: HSM30 ++ 0x1 extra: l1_hit_pebs Retired load uops with L1 cache hits as data sources. Errata: HSM30 ++ 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. Errata: HSM30 ++ 0x2 extra: l2_hit_pebs Retired load uops with L2 cache hits as data sources. Errata: HSM30 ++ 0x4 extra: l3_hit Retired load uops which data sources were data hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x4 extra: l3_hit_pebs Retired load uops which data sources were data hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x8 extra: l1_miss Retired load uops misses in L1 cache as data sources. Errata: HSM30 ++ 0x8 extra: l1_miss_pebs Retired load uops misses in L1 cache as data sources. Errata: HSM30 ++ 0x10 extra: l2_miss Miss in mid-level (L2) cache. Excludes Unknown data-source. Errata: HSM30 ++ 0x10 extra: l2_miss_pebs Miss in mid-level (L2) cache. Excludes Unknown data-source. Errata: HSM30 ++ 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 ++ 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 ++ 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 ++ 0x40 extra: hit_lfb_pebs Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. 
Errata: HSM30 ++name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++ 0x1 extra: xsnp_miss Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x1 extra: xsnp_miss_pebs Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x2 extra: xsnp_hit Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x2 extra: xsnp_hit_pebs Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x4 extra: xsnp_hitm Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30 ++ 0x4 extra: xsnp_hitm_pebs Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30 ++ 0x8 extra: xsnp_none Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x8 extra: xsnp_none_pebs Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 ++name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++ 0x1 extra: local_dram This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30 ++ 0x1 extra: local_dram_pebs This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. 
Errata: HSM30 + name:l2_trans type:exclusive default:0x80 + 0x80 extra: all_requests Transactions accessing L2 pipe + 0x1 extra: demand_data_rd Demand Data Read requests that access L2 cache + 0x2 extra: rfo RFO requests that access L2 cache + 0x4 extra: code_rd L2 cache accesses when fetching instructions +- 0x8 extra: all_pf L2 or LLC HW prefetches that access L2 cache ++ 0x8 extra: all_pf L2 or L3 HW prefetches that access L2 cache + 0x10 extra: l1d_wb L1D writebacks that access L2 cache + 0x20 extra: l2_fill L2 fill requests that access L2 cache + 0x40 extra: l2_wb L2 writebacks that access L2 cache + name:l2_lines_in type:exclusive default:0x7 +- 0x7 extra: all L2 cache lines filling L2 ++ 0x7 extra: all This event counts the number of L2 cache lines brought into the L2 cache. Lines are filled into the L2 cache when there was an L2 miss. + 0x1 extra: i L2 cache lines in I state filling L2 + 0x2 extra: s L2 cache lines in S state filling L2 + 0x4 extra: e L2 cache lines in E state filling L2 diff --git a/SOURCES/oprofile-hugepage.patch b/SOURCES/oprofile-hugepage.patch new file mode 100644 index 0000000..111ba1d --- /dev/null +++ b/SOURCES/oprofile-hugepage.patch @@ -0,0 +1,34 @@ +From 0246c6ba4a08378c46c17617d831d6baf0f44989 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Fri, 9 Jan 2015 16:44:09 -0500 +Subject: [PATCH] Allow operf to track anon_hugepage mmap entries + +The perf mmap information for anon_huge pages has a different filename +("/anon_hugepage") than the mmap information for regions composed of +normal sized pages ("//anon"). This results in opreport not being +able to map samples collected by operf to Java methods when the Java +VM uses statically allocated huge pages (rhbz1180512 and rhbz1180513). 
+ +Signed-off-by: William Cohen +--- + libperf_events/operf_utils.cpp | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index a87524b..90a0765 100644 +--- a/libperf_events/operf_utils.cpp ++++ b/libperf_events/operf_utils.cpp +@@ -295,6 +295,10 @@ static void __handle_mmap_event(event_t * event) + strlen("//anon")) == 0)) { + mapping->is_anon_mapping = true; + strcpy(mapping->filename, "anon"); ++ } else if ((strncmp(mapping->filename, "/anon_hugepage", ++ strlen("/anon_hugepage")) == 0)) { ++ mapping->is_anon_mapping = true; ++ strcpy(mapping->filename, "anon"); + } + mapping->end_addr = (event->mmap.len == 0ULL)? 0ULL : mapping->start_addr + event->mmap.len - 1; + mapping->pgoff = event->mmap.pgoff; +-- +2.1.0 + diff --git a/SOURCES/oprofile-intelcpuid.patch b/SOURCES/oprofile-intelcpuid.patch new file mode 100644 index 0000000..3bb0201 --- /dev/null +++ b/SOURCES/oprofile-intelcpuid.patch @@ -0,0 +1,27 @@ +commit a154a6ba3477c9cb51e2c225e6434909bb41a60a +Author: Andi Kleen +Date: Thu Jun 11 13:54:59 2015 -0700 + + oprofile: Add Intel Airmont and Intel Xeon D model numbers + + Add a model number for Airmont/Braswell CPUs + Add a model number for Broadwell based Xeon D CPUs. 
+ + Signed-off-by: Andi Kleen + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 1d39692..8a7ed1c 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -152,9 +152,11 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x3d: + case 0x47: + case 0x4f: ++ case 0x56: + return CPU_BROADWELL; + case 0x37: + case 0x4d: ++ case 0x4c: + return CPU_SILVERMONT; + } + } diff --git a/SOURCES/oprofile-maskarray.patch b/SOURCES/oprofile-maskarray.patch new file mode 100644 index 0000000..2851b63 --- /dev/null +++ b/SOURCES/oprofile-maskarray.patch @@ -0,0 +1,33 @@ +commit ef501aa609f49c06df9f33c9a7330dffd71b31b3 +Author: Maynard Johnson +Date: Thu Oct 31 11:11:06 2013 -0500 + + Fix handling of default named unit masks longer than 11 chars + + The handling of default unit masks that are names instead of hex + values is new with oprofile 0.9.9. I've discovered a bug in this + handling when the name exceeds 11 characters. For example, on + Sandybridge, the following ocount command fails: + + [mpjohn@oc1757000783 test-stuff]$ ocount -e l1d_blocks ls + Cannot find unit mask bank_confli for l1d_blocks + Unable to find unit mask info for bank_confli for event l1d_blocks + + This problem was due to the char array ('mask') being too small. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index 177835e..9e2addb 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -413,8 +413,8 @@ static void _get_event_code(operf_event_t * event, op_cpu cpu_type) + + + #if defined(__i386__) || defined(__x86_64__) ++ char mask[OP_MAX_UM_NAME_LEN]; + // Setup EventSelct[11:8] field for AMD +- char mask[12]; + const char * vendor_AMD = "AuthenticAMD"; + if (op_is_cpu_vendor((char *)vendor_AMD)) { + config = base_code & 0xF00ULL; diff --git a/SOURCES/oprofile-num_symbolic.patch b/SOURCES/oprofile-num_symbolic.patch new file mode 100644 index 0000000..c081a22 --- /dev/null +++ b/SOURCES/oprofile-num_symbolic.patch @@ -0,0 +1,128 @@ +From 6f10a5b14f5b7f43568d109633533a8ecc057fc6 Mon Sep 17 00:00:00 2001 +From: Lars Friend +Date: Tue, 15 Oct 2013 01:14:53 -0400 +Subject: [PATCH] Allow events with extra flags to also set unit_mask + +Older distributions may be running kernels that still use the +/dev/opcontrol interface. On an Intel Ivy Bridge machine and similar +processors may want to do something like: + +opcontrol --setup --no-vmlinux \ + --event CPU_CLK_UNHALTED:2000000:0:0:1 \ + --event uops_executed:2000000:stall_cycles:0:1 + +For the uops_executed event in the above example need to both set the +extra and the unit_mask bits. The current code in opcontrol would +never set the unit_mask bits when the extra bits were set. This +change allows both to be set when required. + +Signed-off-by: William Cohen +--- + doc/ophelp.1.in | 4 ++++ + utils/opcontrol | 9 +++++++-- + utils/ophelp.c | 27 ++++++++++++++++++++++++++- + 3 files changed, 37 insertions(+), 3 deletions(-) + +diff --git a/doc/ophelp.1.in b/doc/ophelp.1.in +index 083cc85..97383bf 100644 +--- a/doc/ophelp.1.in ++++ b/doc/ophelp.1.in +@@ -49,6 +49,10 @@ Show the default unit mask for the given event. + Show the default unit mask for the given event. 
+ .br + .TP ++.BI "--symbolic-unit-mask / -U [event]" ++Show the numerical unit and extra mask for given event. ++.br ++.TP + .BI "--extra-mask / -E [event]" + Show the extra unit mask for given event. + .br +diff --git a/utils/opcontrol b/utils/opcontrol +index 38bb1ac..a3a6a3c 100644 +--- a/utils/opcontrol ++++ b/utils/opcontrol +@@ -1522,9 +1522,14 @@ do_param_setup() + set_ctr_param $CTR count $COUNT + set_ctr_param $CTR kernel $KERNEL + set_ctr_param $CTR user $USER +- set_ctr_param $CTR unit_mask $UNIT_MASK + +- EXTRA=`$OPHELP --extra-mask $EVENT:$COUNT:$UNIT_MASK_NAMED` ++ # Resolve a [potentially] symbolic unit mask to a numeric ++ # unit mask and extra mask. ++ TMP_SYMBOLIC="`$OPHELP --symbolic-unit-mask $EVENT:$COUNT:$UNIT_MASK`" ++ UNIT_MASK_NUM=`echo $TMP_SYMBOLIC | awk '{print $1}'` ++ EXTRA=`echo $TMP_SYMBOLIC | awk '{print $2}'` ++ set_ctr_param $CTR unit_mask $UNIT_MASK_NUM ++ + if test "$EXTRA" -ne 0 ; then + # A value >= 0x40000 returned by 'ophelp --extra-mask' (EXTRA_MIN_VAL) is interpreted + # as a valid extra value; otherwise we interpret as a simple unit mask value +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 7543c6f..f77a19a 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -282,6 +282,22 @@ static void resolve_events(void) + free(counter_map); + } + ++static void resolve_symbolic_unit_mask(void) ++{ ++ size_t count; ++ unsigned extra = 0; ++ ++ count = parse_events(parsed_events, num_chosen_events, chosen_events, ++ ignore_count ? 
0 : 1); ++ if (count > 1) { ++ fprintf(stderr, "More than one event specified.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ op_resolve_unit_mask(parsed_events, &extra); ++ ++ printf("%d %d\n", parsed_events[0].unit_mask, extra); ++} + + static void show_unit_mask(void) + { +@@ -334,6 +349,7 @@ static int check_events; + static int unit_mask; + static int get_default_event; + static int extra_mask; ++static int symbolic_unit_mask; + + static struct poptOption options[] = { + { "cpu-type", 'c', POPT_ARG_STRING, &cpu_string, 0, +@@ -356,6 +372,9 @@ static struct poptOption options[] = { + "list events as XML", NULL, }, + { "extra-mask", 'E', POPT_ARG_NONE, &extra_mask, 0, + "print extra mask for event", NULL, }, ++ { "symbolic-unit-mask", 'U', POPT_ARG_NONE, &symbolic_unit_mask, 0, ++ "resolve an event with symbolic unit mask into numeric unit " ++ "and extra masks", NULL, }, + POPT_AUTOHELP + { NULL, 0, 0, NULL, 0, NULL, NULL, }, + }; +@@ -457,11 +476,17 @@ int main(int argc, char const * argv[]) + + events = op_events(cpu_type); + +- if (!chosen_events && (unit_mask || check_events || extra_mask)) { ++ if (!chosen_events && (unit_mask || check_events || extra_mask || ++ symbolic_unit_mask)) { + fprintf(stderr, "No events given.\n"); + exit(EXIT_FAILURE); + } + ++ if (symbolic_unit_mask) { ++ resolve_symbolic_unit_mask(); ++ exit(EXIT_SUCCESS); ++ } ++ + if (unit_mask) { + show_unit_mask(); + exit(EXIT_SUCCESS); +-- +1.8.3.1 + diff --git a/SOURCES/oprofile-order.patch b/SOURCES/oprofile-order.patch new file mode 100644 index 0000000..071590e --- /dev/null +++ b/SOURCES/oprofile-order.patch @@ -0,0 +1,59 @@ +From c95158840a7914d558a93b044c5ab0eeb0ea9337 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Tue, 9 Aug 2016 22:25:52 -0400 +Subject: [PATCH] Only start the application if the perf events setup was + successful + +The code was starting the application before the performance events +were setup. 
In some cases the setup of the perf events may fail +and the code needs to verify that the performance events have been +successfully set up before starting the application. Changed the +order of those steps to allow a check of the perf event setup before +launching the application. + +Signed-off-by: William Cohen +--- + pe_counting/ocount.cpp | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 4d9c104..7717717 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -257,16 +257,6 @@ bool start_counting(void) + proc_list = ocount_options::processes; + } + +- if (startApp) { +- // Tell app_PID to start the app +- cverb << vdebug << "telling child to start app" << endl; +- if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) { +- perror("Internal error on start_app_pipe"); +- return -1; +- } +- app_started = true; +- } +- + orecord = new ocount_record(runmode, events, ocount_options::display_interval ? 
true : false); + bool ret; + switch (runmode) { +@@ -300,6 +290,16 @@ bool start_counting(void) + ret = false; + } + ++ if (startApp && ret != false) { ++ // Tell app_PID to start the app ++ cverb << vdebug << "telling child to start app" << endl; ++ if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) { ++ perror("Internal error on start_app_pipe"); ++ return false; ++ } ++ app_started = true; ++ } ++ + return ret; + } + +-- +2.7.4 + diff --git a/SOURCES/oprofile-power8.patch b/SOURCES/oprofile-power8.patch new file mode 100644 index 0000000..91e6fb8 --- /dev/null +++ b/SOURCES/oprofile-power8.patch @@ -0,0 +1,1376 @@ +commit 3795ee4a10c11e16c8d13b5a5d7a6f10615a40d5 +Author: Maynard Johnson +Date: Wed Sep 25 11:15:30 2013 -0500 + + Add two new POWER8 events that are needed for stall analysis + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 994dc27..9c96949 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -54,6 +54,7 @@ event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CO + event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty fo this thread due to branch mispred. + event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty fo this thread due to Icache Miss and branch mispred. + event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty fo this thread due to dispatch hold on this thread due to Issue q full. ++event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty fo this thread due to dispatch hold on this thread due to Mapper full. + event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty fo this thread due to dispatch hold on this thread due to sync. 
+ event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty fo this thread due to dispatch hold on this thread due to SRQ full. + event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty fo this thread due to icach l3 miss. +@@ -87,6 +88,7 @@ event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked In + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. + event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count ). + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed + event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. + event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. +commit 717d4595a0d60faffeb8b9611dda850e3f998ef8 +Author: Maynard Johnson +Date: Mon Feb 3 17:50:54 2014 -0600 + + Fix up event codes for marked architected events + + Fourteen events in the set of architected events had the wrong + event encoding. All 14 were "marked" events, used in random + sampling. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index 1048ec9..465cbbd 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -30,20 +30,20 @@ event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : + event:0x200fc counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss + event:0x400f0 counters:3 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1 + event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded due to a DERAT miss +-event:0x300e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted +-event:0x100e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed +-event:0x400e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 +-event:0x200e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 +-event:0x200e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory +-event:0x300e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes +-event:0x400e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss +-event:0x400e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete +-event:0x100e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction +-event:0x400e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache +-event:0x100e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction 
suffered an icache Miss +-event:0x100ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload +-event:0x200e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss +-event:0x300e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : marked store completed and sent to nest ++event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted ++event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed ++event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 ++event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory ++event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes ++event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss ++event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete ++event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction ++event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache ++event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss ++event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload ++event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss ++event:0x301e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : 
marked store completed and sent to nest + event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles + event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions + event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR +commit 029735879c7ff3ec23aa97dec5ffd95867836cdb +Author: Maynard Johnson +Date: Fri Feb 7 08:58:28 2014 -0600 + + Fix various event names and codes for IBM architected and POWER8 events + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index 465cbbd..f8a9efb 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -8,55 +8,55 @@ + # Manually add CYCLES for backward compatibility for default event + event:0x100f0 counters:0 um:zero minimum:100000 name:CYCLES : Cycles + +-event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : one or more ppc instructions finished +-event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched +-event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : One of threads in run_cycles +-event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts +-event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : New event for Branch Taken +-event:0x100f0 counters:0 um:zero minimum:100000 name:PM_CYC : Cycles +-event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit) +-event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit) +-event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : data from Memory +-event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG reload +-event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt 
+-event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished +-event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type) +-event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : No itags assigned +-event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_MISS : Cycles Instruction ERAT was reloaded +-event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : Number of PPC Dispatched +-event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : A Instruction cacheline request resolved from a location that was beyond the local L3 cache +-event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded (always zero on POWER6) +-event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load +-event:0x200fc counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss +-event:0x400f0 counters:3 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1 +-event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded due to a DERAT miss +-event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted +-event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed +-event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 +-event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 +-event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory +-event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes +-event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss +-event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete 
+-event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction +-event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache +-event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss +-event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload +-event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss +-event:0x301e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : marked store completed and sent to nest +-event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles +-event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions +-event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR +-event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished +-event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1 +-event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event +-event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : PPC Instructions Finished when both threads in run_cycles +-event:0x300ea counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_1024 : Threshold counter exceeded a value of 1024 Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 1024 +-event:0x400ea counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_128 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable 
threshold, this event increments when the threshold exceeded a count of 128 +-event:0x400ec counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_2048 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 2048 +-event:0x100e8 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_256 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 256 +-event:0x200e6 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_32 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 32 +-event:0x100e6 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_4096 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 4096 +-event:0x200e8 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_512 : Threshold counter exceeded a value of 512 Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 512 +-event:0x300e8 counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_64 : Threshold counter exceeded a value of 64 Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 64 +-event:0x100ec counters:0 um:zero minimum:10000 name:PM_THRESH_MET : Threshold exceeded ++event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). 
++event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched. Could be a group with only microcode. Issue HW016521 ++event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). ++event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts. ++event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : Branch Taken. ++event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles. ++event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit). ++event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit). ++event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : Data cache reload from memory (including L4). ++event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). ++event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt. ++event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished. ++event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type). ++event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : Pipeline empty (No itags assigned , no GCT slots used). ++event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). ++event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : PPC Dispatched. ++event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : Inst from L3 miss. ++event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded. ++event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . ++event:0x200fd counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss. 
++event:0x3e054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1. ++event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). ++event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. ++event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. ++event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. ++event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. ++event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. ++event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : Marked dtlb miss. ++event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : marked instruction completed. ++event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : Marked Instruction dispatched. ++event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : n/a ++event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : Marked L1 Icache Miss. ++event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. ++event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. ++event:0x10134 counters:0 um:zero minimum:1000 name:PM_MRK_ST_CMPL : Marked store completed. ++event:0x60005 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. ++event:0x50009 counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. 
++event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR. ++event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished (store sent to nest). ++event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. ++event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event. ++event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : Concurrent Run Instructions. ++event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x401ea counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_128 : Threshold counter exceeded a value of 128. ++event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x101e8 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_256 : Threshold counter exceed a count of 256. ++event:0x201e6 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_32 : Threshold counter exceeded a value of 32. ++event:0x101e6 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_4096 : Threshold counter exceed a count of 4096. ++event:0x201e8 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_512 : Threshold counter exceeded a value of 512. ++event:0x301e8 counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_64 : Threshold counter exceeded a value of 64. ++event:0x101ec counters:0 um:zero minimum:10000 name:PM_THRESH_MET : threshold exceeded. +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 9c96949..54430b4 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -10,7 +10,7 @@ include:ppc64/architected_events_v1 + event:0x40036 counters:3 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. + event:0x40060 counters:3 um:zero minimum:10000 name:PM_BR_CMPL : Branch Instruction completed. + event:0x40138 counters:3 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. 
+-event:0x1e054 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. ++event:0x4000a counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. + event:0x4d018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU : Completion stall due to a Branch Unit. + event:0x2d018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU_CRU : Completion stall due to IFU. + event:0x30026 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_COQ_FULL : Completion stall due to CO q full. +@@ -30,6 +30,7 @@ event:0x4d014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LOAD_FINISH : + event:0x2c010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU : Completion stall by LSU instruction. + event:0x10036 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LWSYNC : completion stall due to isync/lwsync. + event:0x30028 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_MEM_ECC_DELAY : Completion stall due to mem ECC delay. ++event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop + event:0x2e01e counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NTCG_FLUSH : Completion stall due to reject (load hit store). + event:0x30006 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_OTHER_CMPL : Instructions core completed while this thread was stalled. + event:0x4c010 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_REJECT : Completion stall due to LSU reject. +@@ -62,7 +63,7 @@ event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct + event:0x3000a counters:2 um:zero minimum:100000 name:PM_GRP_DISP : dispatch_success (Group Dispatched). + event:0x10130 counters:0 um:zero minimum:10000 name:PM_GRP_MRK : Instruction marked in idu. + event:0x2000a counters:1 um:zero minimum:10000 name:PM_HV_CYC : cycles in hypervisor mode . +-event:0x10002 counters:0 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). 
++event:0x2 counters:0,1,2,3 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). + event:0x10014 counters:0 um:zero minimum:100000 name:PM_IOPS_CMPL : IOPS Completed. + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. +@@ -86,14 +87,13 @@ event:0x40130 counters:3 um:zero minimum:1000 name:PM_MRK_GRP_CMPL : marked inst + event:0x20130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? + event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. +-event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count ). ++event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. +-event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed. + event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. + event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. + event:0x10010 counters:0 um:zero minimum:10000 name:PM_PMC4_OVERFLOW : Overflow from counter 4. + event:0x30024 counters:2 um:zero minimum:10000 name:PM_PMC6_OVERFLOW : Overflow from counter 6. 
+-event:0x40002 counters:3 um:zero minimum:10000 name:PM_PPC_CMPL : PPC Instructions Finished (completed). + event:0x2000c counters:1 um:zero minimum:100000 name:PM_THRD_ALL_RUN_CYC : All Threads in Run_cycles (was both threads in run_cycles). + event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshold counter did not meet threshold. +commit 31389d9cf7c0946479065e0baf0efd52cc4ba1f4 +Author: Maynard Johnson +Date: Fri Feb 7 10:27:46 2014 -0600 + + Fix PM_RUN_CYC and PM_RUN_INST_CMPL event codes broken by previous commit + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index f8a9efb..fad6ca5 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -44,8 +44,8 @@ event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : Marke + event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. + event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. + event:0x10134 counters:0 um:zero minimum:1000 name:PM_MRK_ST_CMPL : Marked store completed. +-event:0x60005 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. +-event:0x50009 counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. ++event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. ++event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. + event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR. + event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished (store sent to nest). + event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. 
+commit f72665b5f28f0d098a985f29672823158c7e85d9 +Author: Maynard Johnson +Date: Wed May 14 13:50:12 2014 -0500 + + Update events for IBM POWER8 processor + + The initial support for the IBM POWER8 processor was added to oprofile in + May 2013. Some events were held back as their descriptions may have exposed + information about the POWER8 architecture that IBM wanted to remain private + until the official announcement. Some other events were held back because they + had not yet been verified. The POWER8 has now been announced and all events + have been verified, so we can now publish all events. + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index fad6ca5..a52d9ee 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -8,32 +8,32 @@ + # Manually add CYCLES for backward compatibility for default event + event:0x100f0 counters:0 um:zero minimum:100000 name:CYCLES : Cycles + +-event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). ++event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). + event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched. Could be a group with only microcode. Issue HW016521 +-event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). ++event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). + event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts. + event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : Branch Taken. +-event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles. 
++event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles . + event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit). + event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit). + event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : Data cache reload from memory (including L4). +-event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). ++event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). + event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt. + event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished. + event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type). + event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : Pipeline empty (No itags assigned , no GCT slots used). +-event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). ++event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). + event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : PPC Dispatched. + event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : Inst from L3 miss. + event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded. +-event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . ++event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . + event:0x200fd counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss. + event:0x3e054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1. +-event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). 
++event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). + event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. + event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. + event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. +-event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. + event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. + event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. + event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : Marked dtlb miss. +@@ -51,9 +51,9 @@ event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructio + event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. + event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event. + event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : Concurrent Run Instructions. +-event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Threshold counter exceeded a value of 1024. + event:0x401ea counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_128 : Threshold counter exceeded a value of 128. 
+-event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048. + event:0x101e8 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_256 : Threshold counter exceed a count of 256. + event:0x201e6 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_32 : Threshold counter exceeded a value of 32. + event:0x101e6 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_4096 : Threshold counter exceed a count of 4096. +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 54430b4..6e4e688 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -7,9 +7,52 @@ + + include:ppc64/architected_events_v1 + +-event:0x40036 counters:3 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. ++event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. ++event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. ++event:0x4e05e counters:3 um:zero minimum:100000 name:PM_4LPAR_CYC : Number of cycles in 4 LPAR mode. 
++event:0x610050 counters:0 um:zero minimum:10000 name:PM_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for all data types ( demand load,data,inst prefetch,inst fetch,xlate (I or d) ++event:0x520050 counters:1 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x620052 counters:1 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x610052 counters:0 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x610054 counters:0 um:zero minimum:10000 name:PM_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x640052 counters:3 um:zero minimum:10000 name:PM_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x630050 counters:2 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x630052 counters:2 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x640050 counters:3 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x2505e counters:1 um:zero minimum:10000 name:PM_BACK_BR_CMPL : Branch instruction completed with a target address less than current instruction address. ++event:0x4082 counters:0,1,2,3 um:zero minimum:10000 name:PM_BANK_CONFLICT : Read blocked due to interleave conflict. The ifar logic will detect an interleave conflict and kill the data that was read that cycle. ++event:0x10068 counters:0 um:zero minimum:10000 name:PM_BRU_FIN : Branch Instruction Finished . ++event:0x20036 counters:1 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. 
++event:0x5086 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_BC_8 : Pairable BC+8 branch that has not been converted to a Resolve Finished in the BRU pipeline ++event:0x5084 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_BC_8_CONV : Pairable BC+8 branch that was converted to a Resolve Finished in the BRU pipeline. + event:0x40060 counters:3 um:zero minimum:10000 name:PM_BR_CMPL : Branch Instruction completed. +-event:0x40138 counters:3 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. ++event:0x40ac counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_CCACHE : Conditional Branch Completed that was Mispredicted due to the Count Cache Target Prediction ++event:0x40b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_CR : Conditional Branch Completed that was Mispredicted due to the BHT Direction Prediction (taken/not taken). ++event:0x40ae counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_LSTACK : Conditional Branch Completed that was Mispredicted due to the Link Stack Target Prediction ++event:0x40ba counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_TA : Conditional Branch Completed that was Mispredicted due to the Target Address Prediction from the Count Cache or Link Stack. Only XL-form branches that resolved Taken set this event. ++event:0x10138 counters:0 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. ++event:0x409c counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR0 : Conditional Branch Completed on BR0 (1st branch in group) in which the HW predicted the Direction or Target ++event:0x409e counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR1 : Conditional Branch Completed on BR1 (2nd branch in group) in which the HW predicted the Direction or Target. Note: BR1 can only be used in Single Thread Mode. In all of the SMT modes, only one branch can complete, thus BR1 is unused. 
++event:0x489c counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR_CMPL : IFU ++event:0x40a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_BR0 : Conditional Branch Completed on BR0 that used the Count Cache for Target Prediction ++event:0x40a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_BR1 : Conditional Branch Completed on BR1 that used the Count Cache for Target Prediction ++event:0x48a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_CMPL : IFU ++event:0x40b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_BR0 : Conditional Branch Completed on BR0 that had its direction predicted. I-form branches do not set this event. In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and bra ++event:0x40b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_BR1 : Conditional Branch Completed on BR1 that had its direction predicted. I-form branches do not set this event. In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and bra ++event:0x48b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_CMPL : IFU ++event:0x40a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_BR0 : Conditional Branch Completed on BR0 that used the Link Stack for Target Prediction ++event:0x40aa counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_BR1 : Conditional Branch Completed on BR1 that used the Link Stack for Target Prediction ++event:0x48a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_CMPL : IFU ++event:0x40b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_BR0 : Conditional Branch Completed on BR0 that had its target address predicted. Only XL-form branches set this event. 
++event:0x40b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_BR1 : Conditional Branch Completed on BR1 that had its target address predicted. Only XL-form branches set this event. ++event:0x48b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_CMPL : IFU ++event:0x40a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_BR0 : Unconditional Branch Completed on BR0. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was coverted to a Resolve. ++event:0x40a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_BR1 : Unconditional Branch Completed on BR1. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was coverted to a Resolve. ++event:0x48a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_CMPL : IFU ++event:0x3094 counters:0,1,2,3 um:zero minimum:10000 name:PM_CASTOUT_ISSUED : Castouts issued ++event:0x3096 counters:0,1,2,3 um:zero minimum:10000 name:PM_CASTOUT_ISSUED_GPR : Castouts issued GPR ++event:0x10050 counters:0 um:zero minimum:10000 name:PM_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for all data types ( demand load,data,inst prefetch,inst fetch,xlate (I or d). ++event:0x2090 counters:0,1,2,3 um:zero minimum:10000 name:PM_CLB_HELD : CLB Hold: Any Reason + event:0x4000a counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. + event:0x4d018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU : Completion stall due to a Branch Unit. + event:0x2d018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU_CRU : Completion stall due to IFU. 
+@@ -30,7 +73,7 @@ event:0x4d014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LOAD_FINISH : + event:0x2c010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU : Completion stall by LSU instruction. + event:0x10036 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LWSYNC : completion stall due to isync/lwsync. + event:0x30028 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_MEM_ECC_DELAY : Completion stall due to mem ECC delay. +-event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop ++event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop. + event:0x2e01e counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NTCG_FLUSH : Completion stall due to reject (load hit store). + event:0x30006 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_OTHER_CMPL : Instructions core completed while this thread was stalled. + event:0x4c010 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_REJECT : Completion stall due to LSU reject. +@@ -41,59 +84,937 @@ event:0x2d010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_SCALAR_LONG : + event:0x2c014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE : Completion stall by stores. + event:0x4c01c counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_ST_FWD : Completion stall due to store forward. + event:0x1001c counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_THRD : Completion stall due to thread conflict. +-event:0x2d014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR : Completion stall due to VSU vector instruction. ++event:0x2d014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR : Completion stall due to VSU vector instruction. + event:0x4d012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR_LONG : Completion stall due to VSU vector long instruction. + event:0x2d012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VSU : Completion stall due to VSU instruction. 
+-event:0x1c042 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a demand load or demand load plus prefetch controlled by MMCR1[20]. +-event:0x1c040 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a demand load or demand load plus prefetch controlled by MMCR1[20] . +-event:0x4c042 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a demand load. +-event:0x4c04e counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a demand load. +-event:0x1c044 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a demand load or demand load plus prefetch controlled by MMCR1[20]. +-event:0x2c048 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a demand load. +-event:0x2c04c counters:1 um:zero minimum:10000 name:PM_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a demand load. +-event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CONF : A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.. +-event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty fo this thread due to branch mispred. +-event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty fo this thread due to Icache Miss and branch mispred. 
+-event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty fo this thread due to dispatch hold on this thread due to Issue q full. +-event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty fo this thread due to dispatch hold on this thread due to Mapper full. +-event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty fo this thread due to dispatch hold on this thread due to sync. +-event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty fo this thread due to dispatch hold on this thread due to SRQ full. +-event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty fo this thread due to icach l3 miss. +-event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct empty fo this thread due to Icache Miss. ++event:0x16083 counters:0 um:zero minimum:10000 name:PM_CO0_ALLOC : 0.0 ++event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy ++event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) ++event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. 
++event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's 
L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The 
processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c048 
counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04a counters:2 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load ++event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load. ++event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c04c counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04c counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c042 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c046 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c046 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04e counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c040 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c040 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x2c040 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c040 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 . ++event:0x4c042 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c044 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c044 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c044 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c046 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04e counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c042 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c042 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c044 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04c counters:0 um:zero minimum:10000 name:PM_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c048 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c04c counters:1 um:zero minimum:10000 name:PM_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04a counters:3 um:zero minimum:10000 name:PM_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c048 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c046 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04a counters:0 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c04a counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c04a counters:2 um:zero minimum:10000 name:PM_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c050 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load. 
++event:0x2c052 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x1c052 counters:0 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load. ++event:0x1c054 counters:0 um:zero minimum:10000 name:PM_DATA_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load. ++event:0x4c052 counters:3 um:zero minimum:10000 name:PM_DATA_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load. ++event:0x3c050 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load. ++event:0x3c052 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x4c050 counters:3 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load. ++event:0x3001a counters:2 um:zero minimum:10000 name:PM_DATA_TABLEWALK_CYC : Data Tablewalk Active. ++event:0xe0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_COLLISIONS : DATA Cache collisions42 ++event:0x1e050 counters:0 um:zero minimum:10000 name:PM_DC_PREF_STREAM_ALLOC : Stream marked valid. The stream could have been allocated through the hardware prefetch mechanism or through software. 
This is combined ls0 and ls1. ++event:0x2e050 counters:1 um:zero minimum:10000 name:PM_DC_PREF_STREAM_CONF : A demand load referenced a line in an active prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Combine up + down. ++event:0x4e050 counters:3 um:zero minimum:10000 name:PM_DC_PREF_STREAM_FUZZY_CONF : A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.Fuzzy stream confirm (out of order effects, or pf cant keep up). ++event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CONF : A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.. ++event:0x4c054 counters:3 um:zero minimum:10000 name:PM_DERAT_MISS_16G : Data ERAT Miss (Data TLB Access) page size 16G. ++event:0x3c054 counters:2 um:zero minimum:10000 name:PM_DERAT_MISS_16M : Data ERAT Miss (Data TLB Access) page size 16M. ++event:0x1c056 counters:0 um:zero minimum:10000 name:PM_DERAT_MISS_4K : Data ERAT Miss (Data TLB Access) page size 4K. ++event:0x2c054 counters:1 um:zero minimum:10000 name:PM_DERAT_MISS_64K : Data ERAT Miss (Data TLB Access) page size 64K. 
++event:0xb0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU : Finish DFU (all finish) ++event:0xb0be counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_DCFFIX : Convert from fixed opcode finish (dcffix,dcffixq) ++event:0xb0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_DENBCD : BCD->DPD opcode finish (denbcd, denbcdq) ++event:0xb0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_MC : Finish DFU multicycle ++event:0x2092 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_BAL : Dispatch/CLB Hold: Balance ++event:0x2094 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_RES : Dispatch/CLB Hold: Resource ++event:0x20a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_SB : Dispatch/CLB Hold: Scoreboard ++event:0x2098 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_SYNC : Dispatch/CLB Hold: Sync type instruction ++event:0x2096 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_TLBIE : Dispatch Hold: Due to TLBIE ++event:0x10006 counters:0 um:zero minimum:10000 name:PM_DISP_HELD : Dispatch Held. ++event:0x20006 counters:1 um:zero minimum:10000 name:PM_DISP_HELD_IQ_FULL : Dispatch held due to Issue q full. ++event:0x1002a counters:0 um:zero minimum:10000 name:PM_DISP_HELD_MAP_FULL : Dispatch held due to Mapper full. ++event:0x30018 counters:2 um:zero minimum:10000 name:PM_DISP_HELD_SRQ_FULL : Dispatch held due SRQ no room. ++event:0x4003c counters:3 um:zero minimum:10000 name:PM_DISP_HELD_SYNC_HOLD : Dispatch held due to SYNC hold. ++event:0x30a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_HOLD_GCT_FULL : Dispatch Hold Due to no space in the GCT ++event:0x30008 counters:2 um:zero minimum:10000 name:PM_DISP_WT : Dispatched Starved (not held, nothing to dispatch). 
++event:0x4e048 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. ++event:0x3e048 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. ++event:0x3e04c counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a data side request. ++event:0x4e04c counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a data side request. ++event:0x1e042 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a data side request. ++event:0x4e046 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a data side request. ++event:0x3e046 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a data side request. ++event:0x1e04e counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a data side request. ++event:0x3e040 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a data side request. 
++event:0x4e040 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a data side request. ++event:0x2e040 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a data side request. ++event:0x1e040 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a data side request. ++event:0x4e042 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a data side request. ++event:0x4e044 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a data side request. ++event:0x3e044 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a data side request. ++event:0x2e044 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a data side request. ++event:0x1e046 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a data side request. ++event:0x4e04e counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a data side request. 
++event:0x3e042 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a data side request. ++event:0x2e042 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a data side request. ++event:0x1e044 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a data side request. ++event:0x1e04c counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a data side request. ++event:0x2e048 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a data side request. ++event:0x2e04c counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a data side request. ++event:0x4e04a counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a data side request. ++event:0x1e048 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a data side request. ++event:0x2e046 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. 
++event:0x1e04a counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. ++event:0x2e04a counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a data side request. ++event:0x3e04a counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a data side request. ++event:0xd094 counters:0,1,2,3 um:zero minimum:10000 name:PM_DSLB_MISS : Data SLB Miss - Total of all segment sizesData SLB misses ++event:0x1c058 counters:0 um:zero minimum:10000 name:PM_DTLB_MISS_16G : Data TLB Miss page size 16G. ++event:0x4c056 counters:3 um:zero minimum:10000 name:PM_DTLB_MISS_16M : Data TLB Miss page size 16M. ++event:0x2c056 counters:1 um:zero minimum:10000 name:PM_DTLB_MISS_4K : Data TLB Miss page size 4k. ++event:0x3c056 counters:2 um:zero minimum:10000 name:PM_DTLB_MISS_64K : Data TLB Miss page size 64K. ++event:0x50a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FORCE_MISPRED : XL-form branch was mispredicted due to the predicted target address missing from EAT. The EAT forces a mispredict in this case since there is no predicated target to validate. This is a rare case that may occur when the EAT is full and a branch is ++event:0x4084 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FULL_CYC : Cycles No room in EATSet on bank conflict and case where no ibuffers available. 
++event:0x2080 counters:0,1,2,3 um:zero minimum:10000 name:PM_EE_OFF_EXT_INT : Ee off and external interrupt ++event:0x20b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FAV_TBEGIN : Dispatch time Favored tbegin ++event:0xa0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_FLOP_SUM_SCALAR : flops summary scalar instructions ++event:0xa0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_FLOP_SUM_VEC : flops summary vector instructions ++event:0x2084 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_BR_MPRED : Flush caused by branch mispredict ++event:0x30012 counters:2 um:zero minimum:10000 name:PM_FLUSH_COMPLETION : Completion Flush. ++event:0x2082 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP : Dispatch flush ++event:0x208c counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_SB : Dispatch Flush: Scoreboard ++event:0x2088 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_SYNC : Dispatch Flush: Sync ++event:0x208a counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_TLBIE : Dispatch Flush: TLBIE ++event:0x208e counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_LSU : Flush initiated by LSU ++event:0x2086 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_PARTIAL : Partial flush ++event:0xa0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FCONV : Convert instruction executed ++event:0xa0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FEST : Estimate instruction executed ++event:0xa0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FRSP : Round to single precision instruction executed ++event:0xa0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FCONV : Convert instruction executed ++event:0xa0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FEST : Estimate instruction executed ++event:0xa0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FRSP : Round to single precision instruction executed ++event:0x3000c counters:2 um:zero minimum:10000 name:PM_FREQ_DOWN : Frequency is being slewed down due to 
Power Management. ++event:0x4000c counters:3 um:zero minimum:10000 name:PM_FREQ_UP : Frequency is being slewed up due to Power Management. ++event:0x50b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_1 : One pair of instructions fused with TOC in Group0 ++event:0x50ae counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_2 : Two pairs of instructions fused with TOC in Group0 ++event:0x50ac counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_3 : Three pairs of instructions fused with TOC in Group0 ++event:0x50b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP1_1 : One pair of instructions fused with TOC in Group1 ++event:0x50b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_1 : One pair of instructions fused with VSX in Group0 ++event:0x50b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_2 : Two pairs of instructions fused with VSX in Group0 ++event:0x50b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_3 : Three pairs of instructions fused with VSX in Group0 ++event:0x50ba counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP1_1 : One pair of instructions fused with VSX in Group1 ++event:0x3000e counters:2 um:zero minimum:10000 name:PM_FXU0_BUSY_FXU1_IDLE : fxu0 busy and fxu1 idle. ++event:0x10004 counters:0 um:zero minimum:10000 name:PM_FXU0_FIN : FXU0 Finished. ++event:0x4000e counters:3 um:zero minimum:10000 name:PM_FXU1_BUSY_FXU0_IDLE : fxu0 idle and fxu1 busy. ++event:0x40004 counters:3 um:zero minimum:10000 name:PM_FXU1_FIN : FXU1 Finished. ++event:0x2000e counters:1 um:zero minimum:10000 name:PM_FXU_BUSY : fxu0 busy and fxu1 busy. ++event:0x1000e counters:0 um:zero minimum:10000 name:PM_FXU_IDLE : fxu0 idle and fxu1 idle. ++event:0x20008 counters:1 um:zero minimum:10000 name:PM_GCT_EMPTY_CYC : No itags assigned either thread (GCT Empty). ++event:0x30a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_MERGE : Group dispatched on a merged GCT empty.
GCT entries can be merged only within the same thread ++event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty for this thread due to branch mispred. ++event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty for this thread due to Icache Miss and branch mispred. ++event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty for this thread due to dispatch hold on this thread due to Issue q full. ++event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty for this thread due to dispatch hold on this thread due to Mapper full. ++event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty for this thread due to dispatch hold on this thread due to sync. ++event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty for this thread due to dispatch hold on this thread due to SRQ full. ++event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty for this thread due to Icache L3 miss. ++event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct empty for this thread due to Icache Miss.
++event:0x20a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_11_14_ENTRIES : GCT Utilization 11-14 entries ++event:0x20a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_15_17_ENTRIES : GCT Utilization 15-17 entries ++event:0x20a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_18_ENTRIES : GCT Utilization 18+ entries ++event:0x209c counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_1_2_ENTRIES : GCT Utilization 1-2 entries ++event:0x209e counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_3_6_ENTRIES : GCT Utilization 3-6 entries ++event:0x20a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_7_10_ENTRIES : GCT Utilization 7-10 entries ++event:0x1000a counters:0 um:zero minimum:10000 name:PM_GRP_BR_MPRED_NONSPEC : Group experienced Non-speculative br mispredict. ++event:0x30004 counters:2 um:zero minimum:100000 name:PM_GRP_CMPL : group completed. + event:0x3000a counters:2 um:zero minimum:100000 name:PM_GRP_DISP : dispatch_success (Group Dispatched). ++event:0x1000c counters:0 um:zero minimum:10000 name:PM_GRP_IC_MISS_NONSPEC : Group experienced Non-speculative I cache miss. + event:0x10130 counters:0 um:zero minimum:10000 name:PM_GRP_MRK : Instruction marked in idu. ++event:0x509c counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_NON_FULL_GROUP : GROUPs where we did not have 6 non branch instructions in the group(ST mode), in SMT mode 3 non branches ++event:0x20050 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate).
++event:0x20052 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x10052 counters:0 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x50a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_2ND_BRANCH : There were enough instructions in the Ibuffer, but 2nd branch ends group ++event:0x50a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_FPU_AFTER_BR : There were enough instructions in the Ibuffer, but FPU OP IN same group after a branch terminates a group, can't do partial flushes ++event:0x509e counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_NOINST : Do not fill every slot in the group, Not enough instructions in the Ibuffer. This includes cases where the group started with enough instructions, but some got knocked out by a cache miss or branch redirect (which would also empty the Ibuffer). ++event:0x50a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_OTHER : There were enough instructions in the Ibuffer, but the group terminated early for some other reason, most likely due to a First or Last. ++event:0x50a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_SLOT_LIMIT : There were enough instructions in the Ibuffer, but 3 src RA/RB/RC , 2 way crack caused a group termination + event:0x2000a counters:1 um:zero minimum:10000 name:PM_HV_CYC : cycles in hypervisor mode . ++event:0x4086 counters:0,1,2,3 um:zero minimum:10000 name:PM_IBUF_FULL_CYC : Cycles No room in ibuff fully qualified transfer (if5 valid).
++event:0x10018 counters:0 um:zero minimum:10000 name:PM_IC_DEMAND_CYC : Demand ifetch pending. ++event:0x4098 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BHT_REDIRECT : L2 I cache demand request due to BHT redirect, branch redirect ( 2 bubbles 3 cycles) ++event:0x409a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BR_REDIRECT : L2 I cache demand request due to branch Mispredict ( 15 cycle path) ++event:0x4088 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_REQ : Demand Instruction fetch request ++event:0x508a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_INVALIDATE : Ic line invalidated ++event:0x4092 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_HIT : Prefetch Canceled due to icache hit ++event:0x4094 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_L2 : L2 Squashed request ++event:0x4090 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_PAGE : Prefetch Canceled due to page boundary ++event:0x408a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_REQ : Instruction prefetch requests ++event:0x408e counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_WRITE : Instruction prefetch written into IL1 ++event:0x4096 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_RELOAD_PRIVATE : Reloading line was brought in private for a specific thread. Most lines are brought in shared for all eight threads. If RA does not match then invalidates and then brings it shared to other thread. In P7 line brought in private , then line was inv ++event:0x4006a counters:3 um:zero minimum:10000 name:PM_IERAT_RELOAD_16M : IERAT Reloaded (Miss) for a 16M page. ++event:0x20064 counters:1 um:zero minimum:10000 name:PM_IERAT_RELOAD_4K : IERAT Reloaded (Miss) for a 4k page. ++event:0x3006a counters:2 um:zero minimum:10000 name:PM_IERAT_RELOAD_64K : IERAT Reloaded (Miss) for a 64k page. ++event:0x3405e counters:2 um:zero minimum:10000 name:PM_IFETCH_THROTTLE : Cycles instruction fetch was throttled in IFU.
++event:0x5088 counters:0,1,2,3 um:zero minimum:10000 name:PM_IFU_L2_TOUCH : L2 touch to update MRU on a line ++event:0x514050 counters:0 um:zero minimum:10000 name:PM_INST_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for an instruction fetch ++event:0x544048 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534048 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x53404c counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL4 : The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404c counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_DMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514042 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2 : The processor's Instruction cache was reloaded from local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544046 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L21_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch 
plus prefetch if MMCR1[17] is 1 ++event:0x534046 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L21_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404e counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2MISS : The processor's Instruction cache was reloaded from a localtion other than the local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534040 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544040 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524040 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_MEPF : The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. 
due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514040 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L2 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544042 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3 : The processor's Instruction cache was reloaded from local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544044 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_ECO_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534044 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_ECO_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524044 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514046 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404e counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3MISS_MOD : The processor's Instruction cache was reloaded from a localtion other than the local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534042 
counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_DISP_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524042 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_MEPF : The processor's Instruction cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514044 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404c counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_LL4 : The processor's Instruction cache was reloaded from the local chip's L4 cache due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524048 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_LMEM : The processor's Instruction cache was reloaded from the local chip's Memory due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x52404c counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_MEMORY : The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404a counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_OFF_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514048 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_ON_CHIP_CACHE : The processor's 
Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524046 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404a counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x52404a counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL4 : The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x53404a counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_RMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524050 counters:1 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for an instruction fetch ++event:0x524052 counters:1 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x514052 counters:0 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_MPRED_RTY : Final 
Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor an instruction fetch ++event:0x514054 counters:0 um:zero minimum:10000 name:PM_INST_ALL_PUMP_CPRED : Pump prediction correct. Counts across all types of pumpsfor an instruction fetch ++event:0x544052 counters:3 um:zero minimum:10000 name:PM_INST_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor an instruction fetch ++event:0x534050 counters:2 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for an instruction fetch ++event:0x534052 counters:2 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x544050 counters:3 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for an instruction fetch ++event:0x14050 counters:0 um:zero minimum:10000 name:PM_INST_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for an instruction fetch. + event:0x2 counters:0,1,2,3 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). ++event:0x44048 counters:3 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x34048 counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x3404c counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL4 : The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404c counters:3 um:zero minimum:10000 name:PM_INST_FROM_DMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4080 counters:0,1,2,3 um:zero minimum:10000 name:PM_INST_FROM_L1 : Instruction fetches from L1 ++event:0x14042 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2 : The processor's Instruction cache was reloaded from local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44046 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L21_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34046 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L21_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404e counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2MISS : The processor's Instruction cache was reloaded from a localtion other than the local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34040 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_LDHITST : The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44040 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_OTHER : The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24040 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L2_MEPF : The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14040 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L2 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44042 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3 : The processor's Instruction cache was reloaded from local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44044 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x34044 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24044 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L31_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14046 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L31_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404e counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3MISS_MOD : The processor's Instruction cache was reloaded from a localtion other than the local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34042 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3_DISP_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24042 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L3_MEPF : The processor's Instruction cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14044 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L3_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404c counters:0 um:zero minimum:10000 name:PM_INST_FROM_LL4 : The processor's Instruction cache was reloaded from the local chip's L4 cache due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24048 counters:1 um:zero minimum:10000 name:PM_INST_FROM_LMEM : The processor's Instruction cache was reloaded from the local chip's Memory due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x2404c counters:1 um:zero minimum:10000 name:PM_INST_FROM_MEMORY : The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404a counters:3 um:zero minimum:10000 name:PM_INST_FROM_OFF_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14048 counters:0 um:zero minimum:10000 name:PM_INST_FROM_ON_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24046 counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404a counters:0 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x2404a counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL4 : The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x3404a counters:2 um:zero minimum:10000 name:PM_INST_FROM_RMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24050 counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for an instruction fetch. ++event:0x24052 counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x14052 counters:0 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor an instruction fetch. ++event:0x1003a counters:0 um:zero minimum:10000 name:PM_INST_IMC_MATCH_CMPL : IMC Match Count. ++event:0x30016 counters:2 um:zero minimum:10000 name:PM_INST_IMC_MATCH_DISP : IMC Matches dispatched. ++event:0x14054 counters:0 um:zero minimum:10000 name:PM_INST_PUMP_CPRED : Pump prediction correct. Counts across all types of pumpsfor an instruction fetch. 
++event:0x44052 counters:3 um:zero minimum:10000 name:PM_INST_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor an instruction fetch. ++event:0x34050 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for an instruction fetch. ++event:0x34052 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x44050 counters:3 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for an instruction fetch. + event:0x10014 counters:0 um:zero minimum:100000 name:PM_IOPS_CMPL : IOPS Completed. ++event:0x30014 counters:2 um:zero minimum:100000 name:PM_IOPS_DISP : IOPS dispatched. ++event:0x45048 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request. ++event:0x35048 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request. ++event:0x3504c counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request. 
++event:0x4504c counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a instruction side request. ++event:0x15042 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a instruction side request. ++event:0x45046 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a instruction side request. ++event:0x35046 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a instruction side request. ++event:0x1504e counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a instruction side request. ++event:0x35040 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a instruction side request. ++event:0x45040 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a instruction side request. ++event:0x25040 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a instruction side request. ++event:0x15040 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a instruction side request. 
++event:0x45042 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a instruction side request. ++event:0x45044 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request. ++event:0x35044 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a instruction side request. ++event:0x25044 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a instruction side request. ++event:0x15046 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a instruction side request. ++event:0x4504e counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a instruction side request. ++event:0x35042 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a instruction side request. ++event:0x25042 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a instruction side request. ++event:0x15044 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a instruction side request. 
++event:0x1504c counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a instruction side request. ++event:0x25048 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request. ++event:0x2504c counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a instruction side request. ++event:0x4504a counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a instruction side request. ++event:0x15048 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a instruction side request. ++event:0x25046 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. ++event:0x1504a counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. ++event:0x2504a counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a instruction side request. 
++event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a instruction side request. ++event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts ++event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine ++event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision ++event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) ++event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch ++event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. ++event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject ++event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject ++event:0x38ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FXU : ISU ++event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS0 : LS0 ISU reject ++event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS1 : LS1 ISU reject ++event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS2 : LS2 ISU reject ++event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS3 : LS3 ISU reject ++event:0x309c counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECTS_ALL : All isu rejects could be more than 1 per cycle ++event:0x30a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_RES_NA : ISU reject due to resource not available ++event:0x309e counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SAR_BYPASS : Reject because of SAR bypass ++event:0x30a0 
counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SRC_NA : ISU reject due to source not available ++event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 ISU reject ++event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject ++event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU ++event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread ++event:0x200301ea counters:2 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) ++event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . ++event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 ++event:0x40012 counters:3 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_ALL : Counts all Icache reloads includes demand, prefetchm prefetch turned into demand and demand turned into prefetch. ++event:0x30068 counters:2 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_PREF : Counts all Icache prefetch reloads ( includes demand turned into prefetch). 
++event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) ++event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) ++event:0x27084 counters:1 um:zero minimum:10000 name:PM_L2_CHIP_PUMP : RC requests that were local on chip pump attempts ++event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 ++event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. ++event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts ++event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) ++event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) ++event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 ++event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) ++event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread ++event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful load dispatches ++event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits ++event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread ++event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc 
and guess was correct (ie data local) ++event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) ++event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt ++event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ ++event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons ++event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt ++event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ ++event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons ++event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx ++event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core ++event:0x3708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core ++event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M ++event:0x54708e counters:3 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M ++event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix ++event:0x17080 counters:0 um:zero minimum:10000 name:PM_L2_ST : All successful D-side store dispatches for this thread ++event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches ++event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits 
++event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successful D-side store dispatches for this thread that were L2 Miss ++event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) ++event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) ++event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts ++event:0x1e05e counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. ++event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. ++event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives ++event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) ++event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 ++event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid ++event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) ++event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) ++event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough ++event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial 
scope=group and data from same group (near) (pred successful) ++event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high ++event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). Prediction too Low ++event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits ++event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits ++event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss ++event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit ++event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss ++event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits ++event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss ++event:0x1e052 counters:0 um:zero minimum:10000 name:PM_L3_LD_PREF : L3 Load Prefetches. ++event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) ++event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low ++event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses ++event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 ++event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 ++event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 ++event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 ++event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 ++event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 ++event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 ++event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 ++event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 ++event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 ++event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx ++event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 ++event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 ++event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 ++event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 ++event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 ++event:0xa29086 counters:1 um:zero minimum:10000 name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 ++event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 ++event:0x518082 counters:0 um:zero 
minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 ++event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 ++event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 ++event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 ++event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 ++event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx ++event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 ++event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 ++event:0x84908d counters:3 um:zero minimum:10000 name:PM_L3_PF0_ALLOC : 0.0 ++event:0x84908c counters:3 um:zero minimum:10000 name:PM_L3_PF0_BUSY : lifetime, sample of PF machine 0 valid ++event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 ++event:0x18080 counters:0 um:zero minimum:10000 name:PM_L3_PF_MISS_L3 : L3 Prefetch missed in L3 ++event:0x3808a counters:2 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_CACHE : L3 Prefetch from Off chip cache ++event:0x4808e counters:3 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_MEM : L3 Prefetch from Off chip memory ++event:0x38088 counters:2 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_CACHE : L3 Prefetch from On chip cache ++event:0x4808c counters:3 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_MEM : L3 Prefetch from On chip memory ++event:0x829084 counters:1 um:zero minimum:10000 name:PM_L3_PF_USAGE : rotating sample of 32 PF actives ++event:0x4e052 counters:3 um:zero minimum:10000 name:PM_L3_PREF_ALL : Total HW L3 prefetches(Load+store). 
++event:0x84908f counters:3 um:zero minimum:10000 name:PM_L3_RD0_ALLOC : 0.0 ++event:0x84908e counters:3 um:zero minimum:10000 name:PM_L3_RD0_BUSY : lifetime, sample of RD machine 0 valid ++event:0x829086 counters:1 um:zero minimum:10000 name:PM_L3_RD_USAGE : rotating sample of 16 RD actives ++event:0x839089 counters:2 um:zero minimum:10000 name:PM_L3_SN0_ALLOC : 0.0 ++event:0x839088 counters:2 um:zero minimum:10000 name:PM_L3_SN0_BUSY : lifetime, sample of snooper machine 0 valid ++event:0x819080 counters:0 um:zero minimum:10000 name:PM_L3_SN_USAGE : rotating sample of 8 snoop valids ++event:0x2e052 counters:1 um:zero minimum:10000 name:PM_L3_ST_PREF : L3 store Prefetches. ++event:0x3e052 counters:2 um:zero minimum:10000 name:PM_L3_SW_PREF : Data stream touchto L3. ++event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) ++event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Predction too high ++event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch ++event:0x18081 counters:0 um:zero minimum:10000 name:PM_L3_WI0_ALLOC : 0.0 ++event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid ++event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives ++event:0x3c058 counters:2 um:zero minimum:10000 name:PM_LARX_FIN : Larx finished . + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. ++event:0x100ee counters:0 um:zero minimum:10000 name:PM_LD_REF_L1 : Load Ref count combined for all units. 
++event:0xc080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU0 : LS0 L1 D cache load references counted at finish, gated by rejectLSU0 L1 D cache load references ++event:0xc082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU1 : LS1 L1 D cache load references counted at finish, gated by rejectLSU1 L1 D cache load references ++event:0xc094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU2 : LS2 L1 D cache load references counted at finish, gated by reject42 ++event:0xc096 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU3 : LS3 L1 D cache load references counted at finish, gated by reject42 ++event:0x509a counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_INVALID_PTR : A flush were LS ptr is invalid, results in a pop , A lot of interrupts between push and pops ++event:0x5098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_WRONG_ADD_PRED : Link stack predicts wrong address, because of link stack design limitation. ++event:0xe080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_ERAT_MISS_PREF : LS0 Erat miss due to prefetch42 ++event:0xd0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_L1_PREF : LS0 L1 cache data prefetches42 ++event:0xc098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_L1_SW_PREF : Software L1 Prefetches, including SW Transient Prefetches42 ++event:0xe082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_ERAT_MISS_PREF : LS1 Erat miss due to prefetch42 ++event:0xd0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_L1_PREF : LS1 L1 cache data prefetches42 ++event:0xc09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_L1_SW_PREF : Software L1 Prefetches, including SW Transient Prefetches42 ++event:0xc0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_LRQ : LS0 Flush: LRQLSU0 LRQ flushes ++event:0xc0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_SRQ : LS0 Flush: SRQLSU0 SRQ lhs flushes ++event:0xc0a4 counters:0,1,2,3 um:zero minimum:10000 
name:PM_LSU0_FLUSH_ULD : LS0 Flush: Unaligned LoadLSU0 unaligned load flushes ++event:0xc0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_UST : LS0 Flush: Unaligned StoreLSU0 unaligned store flushes ++event:0xf088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_L1_CAM_CANCEL : ls0 l1 tm cam cancel42 ++event:0x1e056 counters:0 um:zero minimum:10000 name:PM_LSU0_LARX_FIN : . ++event:0xd08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_LMQ_LHR_MERGE : LS0 Load Merged with another cacheline request42 ++event:0xc08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_NCLD : LS0 Non-cachable Loads counted at finishLSU0 non-cacheable loads ++event:0xe090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x1e05a counters:0 um:zero minimum:10000 name:PM_LSU0_REJECT : LSU0 reject . ++event:0xc09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_SRQ_STFWD : LS0 SRQ forwarded data to a loadLSU0 SRQ store forwarded ++event:0xf084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_STORE_REJECT : ls0 store reject42 ++event:0xe0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_LRQ : LS1 Flush: LRQLSU1 LRQ flushes ++event:0xc0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_SRQ : LS1 Flush: SRQLSU1 SRQ lhs flushes ++event:0xc0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_ULD : LS 1 Flush: Unaligned LoadLSU1 unaligned load flushes ++event:0xc0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_UST : LS1 Flush: Unaligned StoreLSU1 unaligned store flushes ++event:0xf08a counters:0,1,2,3 um:zero 
minimum:10000 name:PM_LSU1_L1_CAM_CANCEL : ls1 l1 tm cam cancel42 ++event:0x2e056 counters:1 um:zero minimum:10000 name:PM_LSU1_LARX_FIN : Larx finished in LSU pipe1. ++event:0xd08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_LMQ_LHR_MERGE : LS1 Load Merge with another cacheline request42 ++event:0xc08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_NCLD : LS1 Non-cachable Loads counted at finishLSU1 non-cacheable loads ++event:0xe092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x2e05a counters:1 um:zero minimum:10000 name:PM_LSU1_REJECT : LSU1 reject . ++event:0xc09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_SRQ_STFWD : LS1 SRQ forwarded data to a loadLSU1 SRQ store forwarded ++event:0xf086 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_STORE_REJECT : ls1 store reject42 ++event:0xe0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_LRQ : LS02Flush: LRQ42 ++event:0xc0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_SRQ : LS2 Flush: SRQ42 ++event:0xc0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_ULD : LS3 Flush: Unaligned Load42 ++event:0xf08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_L1_CAM_CANCEL : ls2 l1 tm cam cancel42 ++event:0x3e056 counters:2 um:zero minimum:10000 name:PM_LSU2_LARX_FIN : Larx finished in LSU pipe2. 
++event:0xc084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LDF : LS2 Scalar Loads42 ++event:0xc088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LDX : LS0 Vector Loads42 ++event:0xd090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LMQ_LHR_MERGE : LS0 Load Merged with another cacheline request42 ++event:0xe094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x3e05a counters:2 um:zero minimum:10000 name:PM_LSU2_REJECT : LSU2 reject . ++event:0xc0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_SRQ_STFWD : LS2 SRQ forwarded data to a load42 ++event:0xe0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_LRQ : LS3 Flush: LRQ42 ++event:0xc0be counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_SRQ : LS13 Flush: SRQ42 ++event:0xc0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_ULD : LS 14Flush: Unaligned Load42 ++event:0xf08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_L1_CAM_CANCEL : ls3 l1 tm cam cancel42 ++event:0x4e056 counters:3 um:zero minimum:10000 name:PM_LSU3_LARX_FIN : Larx finished in LSU pipe3. 
++event:0xc086 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LDF : LS3 Scalar Loads 42 ++event:0xc08a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LDX : LS1 Vector Loads42 ++event:0xd092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LMQ_LHR_MERGE : LS1 Load Merge with another cacheline request42 ++event:0xe096 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x4e05a counters:3 um:zero minimum:10000 name:PM_LSU3_REJECT : LSU3 reject . ++event:0xc0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_SRQ_STFWD : LS3 SRQ forwarded data to a load42 ++event:0xe0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_MISS : Load tm L1 miss42 ++event:0xe880 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_ERAT_MISS_PREF : LSU ++event:0x30066 counters:2 um:zero minimum:10000 name:PM_LSU_FIN : LSU Finished an instruction (up to 2 per cycle). ++event:0xc8ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_UST : LSU ++event:0xd0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FOUR_TABLEWALK_CYC : Cycles when four tablewalks pending on this thread42 ++event:0x10066 counters:0 um:zero minimum:10000 name:PM_LSU_FX_FIN : LSU Finished a FX operation (up to 2 per cycle. 
++event:0xd8b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_L1_PREF : LSU ++event:0xc898 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_L1_SW_PREF : LSU ++event:0xc884 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LDF : LSU ++event:0xc888 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LDX : LSU ++event:0xd0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_FULL_CYC : LMQ fullCycles LMQ full, ++event:0xd0a1 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_S0_ALLOC : 0.0 ++event:0xd0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_S0_VALID : Slot 0 of LMQ validLMQ slot 0 valid ++event:0x3001c counters:2 um:zero minimum:10000 name:PM_LSU_LMQ_SRQ_EMPTY_ALL_CYC : ALL threads lsu empty (lmq and srq empty). Issue HW016541 ++event:0x2003e counters:1 um:zero minimum:10000 name:PM_LSU_LMQ_SRQ_EMPTY_CYC : LSU empty (lmq and srq empty). ++event:0xd09f counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S0_ALLOC : 0.0 ++event:0xd09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S0_VALID : Slot 0 of LRQ validLRQ slot 0 valid ++event:0xf091 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S43_ALLOC : 0.0 ++event:0xf090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S43_VALID : LRQ slot 43 was busy42 ++event:0x30162 counters:2 um:zero minimum:10000 name:PM_LSU_MRK_DERAT_MISS : DERAT Reloaded (Miss). ++event:0xc88c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_NCLD : LSU ++event:0xc092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_NCST : Non-cachable Stores sent to nest42 ++event:0x10064 counters:0 um:zero minimum:10000 name:PM_LSU_REJECT : LSU Reject (up to 4 per cycle). ++event:0x2e05c counters:1 um:zero minimum:10000 name:PM_LSU_REJECT_ERAT_MISS : LSU Reject due to ERAT (up to 4 per cycles). ++event:0x4e05c counters:3 um:zero minimum:10000 name:PM_LSU_REJECT_LHS : LSU Reject due to LHS (up to 4 per cycle). 
++event:0x1e05c counters:0 um:zero minimum:10000 name:PM_LSU_REJECT_LMQ_FULL : LSU reject due to LMQ full ( 4 per cycle). ++event:0xd082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SET_MPRED : Line already in cache at reload time42 ++event:0x40008 counters:3 um:zero minimum:10000 name:PM_LSU_SRQ_EMPTY_CYC : All threads srq empty. ++event:0x1001a counters:0 um:zero minimum:10000 name:PM_LSU_SRQ_FULL_CYC : SRQ is Full. ++event:0xd09d counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S0_ALLOC : 0.0 ++event:0xd09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S0_VALID : Slot 0 of SRQ validSRQ slot 0 valid ++event:0xf093 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S39_ALLOC : 0.0 ++event:0xf092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S39_VALID : SRQ slot 39 was busy42 ++event:0xd09b counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_SYNC : 0.0 ++event:0xd09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_SYNC_CYC : A sync is in the SRQ (edge detect to count)SRQ sync duration ++event:0xf084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_STORE_REJECT : LSU ++event:0xd0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_TWO_TABLEWALK_CYC : Cycles when two tablewalks pending on this thread42 ++event:0x5094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LWSYNC : threaded version, IC Misses where we got EA dir hit but no sector valids were on. ICBI took line out ++event:0x209a counters:0,1,2,3 um:zero minimum:10000 name:PM_LWSYNC_HELD : LWSYNC held at dispatch ++event:0x4c058 counters:3 um:zero minimum:10000 name:PM_MEM_CO : Memory castouts from this lpar. ++event:0x10058 counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_IFU : Local Memory above threshold for IFU speculation control. ++event:0x40056 counters:3 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_HIGH : Local memory above threshold for LSU medium. 
++event:0x1c05e counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_MED : Local memory above theshold for data prefetch. ++event:0x2c058 counters:1 um:zero minimum:10000 name:PM_MEM_PREF : Memory prefetch for this lpar. ++event:0x10056 counters:0 um:zero minimum:10000 name:PM_MEM_READ : Reads from Memory from this lpar (includes data/inst/xlate/l1prefetch/inst prefetch). ++event:0x3c05e counters:2 um:zero minimum:10000 name:PM_MEM_RWITM : Memory rwitm for this lpar. ++event:0x3515e counters:2 um:zero minimum:1000 name:PM_MRK_BACK_BR_CMPL : Marked branch instruction completed with a target address less than current instruction address. ++event:0x2013a counters:1 um:zero minimum:1000 name:PM_MRK_BRU_FIN : bru marked instr finish. ++event:0x1016e counters:0 um:zero minimum:1000 name:PM_MRK_BR_CMPL : Branch Instruction completed. ++event:0x3013a counters:2 um:zero minimum:1000 name:PM_MRK_CRU_FIN : IFU non-branch marked instruction finished. ++event:0x4d148 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x2d128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x3d148 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x2c128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. 
++event:0x3d14c counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x2c12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4_CYC : Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x4d14c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load. ++event:0x2d12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load. + event:0x1d142 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a marked load. +-event:0x4c12e counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L2 due to a marked load. +-event:0x4c122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. ++event:0x4d146 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load. ++event:0x2d126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load. ++event:0x3d146 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load. 
++event:0x2c126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load. ++event:0x4c12e counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L2 due to a marked load. ++event:0x4c122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. ++event:0x3d140 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load. ++event:0x2c120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC : Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load. ++event:0x4d140 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC : Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d140 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. ++event:0x4d120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF_CYC : Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. + event:0x1d140 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a marked load. 
+-event:0x4c120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. ++event:0x4c120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. + event:0x4d142 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a marked load. +-event:0x2d12e counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L3 due to a marked load. +-event:0x2d122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. ++event:0x4d144 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x3d144 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2c124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d144 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load. 
++event:0x4d124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load. ++event:0x1d146 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x4c126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x2d12e counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L3 due to a marked load. ++event:0x2d122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. ++event:0x3d142 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load. ++event:0x2c122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC : Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load. ++event:0x2d142 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. ++event:0x4d122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF_CYC : Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. + event:0x1d144 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a marked load. 
+-event:0x4c124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. +-event:0x1d14c counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. +-event:0x4c12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. +-event:0x2d148 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. +-event:0x4d128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. ++event:0x4c124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. ++event:0x1d14c counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. ++event:0x4c12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. ++event:0x2d148 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. ++event:0x4d128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. + event:0x2d14c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. 
+-event:0x4d12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d14a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x2d12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x1d148 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a marked load. ++event:0x4c128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load. ++event:0x2d146 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4d126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. 
++event:0x1d14a counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4c12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x2d14a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x4d12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4_CYC : Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x3d14a counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x2c12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x40118 counters:3 um:zero minimum:1000 name:PM_MRK_DCACHE_RELOAD_INTV : Combined Intervention event. ++event:0x4d154 counters:3 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16G : Marked Data ERAT Miss (Data TLB Access) page size 16G. ++event:0x3d154 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16M : Marked Data ERAT Miss (Data TLB Access) page size 16M. ++event:0x1d156 counters:0 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_4K : Marked Data ERAT Miss (Data TLB Access) page size 4K. ++event:0x2d154 counters:1 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_64K : Marked Data ERAT Miss (Data TLB Access) page size 64K. 
++event:0x20132 counters:1 um:zero minimum:1000 name:PM_MRK_DFU_FIN : Decimal Unit marked Instruction Finish. ++event:0x4f148 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f148 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f14c counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. ++event:0x4f14c counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. ++event:0x1f142 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. ++event:0x4f146 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. ++event:0x3f146 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. ++event:0x1f14e counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a marked data side request. 
++event:0x3f140 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a marked data side request. ++event:0x4f140 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a marked data side request. ++event:0x2f140 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. ++event:0x1f140 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. ++event:0x4f142 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. ++event:0x4f144 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. ++event:0x3f144 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. ++event:0x2f144 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. ++event:0x1f146 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request. 
++event:0x4f14e counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a marked data side request. ++event:0x3f142 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. ++event:0x2f142 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked data side request. ++event:0x1f144 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. ++event:0x1f14c counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request. ++event:0x2f148 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. ++event:0x2f14c counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. ++event:0x4f14a counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. ++event:0x1f148 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request. 
++event:0x2f146 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x1f14a counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x2f14a counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. ++event:0x3f14a counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. ++event:0x1d158 counters:0 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16G : Marked Data TLB Miss page size 16G. ++event:0x4d156 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16M : Marked Data TLB Miss page size 16M. ++event:0x2d156 counters:1 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_4K : Marked Data TLB Miss page size 4k. ++event:0x3d156 counters:2 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_64K : Marked Data TLB Miss page size 64K. ++event:0x40154 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL : Marked store had to do a bkill. ++event:0x2f150 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL_CYC : cycles L2 RC took for a bkill. ++event:0x3015e counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_CLAIM_RTY : Sampled store did a rwitm and got a rty. ++event:0x30154 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM : Marked store had to do a dclaim. ++event:0x2f152 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM_CYC : cycles L2 RC took for a dclaim. 
++event:0x30156 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH : ttype and cresp matched as specified in MMCR1. ++event:0x4f152 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH_CYC : cresp/ttype match cycles. ++event:0x4015e counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_RTY : Sampled L2 reads retry count. ++event:0x1015e counters:0 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_T_INTV : Sampled Read got a T intervention. ++event:0x4f150 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_CYC : cycles L2 RC took for a rwitm. ++event:0x2015e counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_RTY : Sampled store did a rwitm and got a rty. ++event:0x3012e counters:2 um:zero minimum:1000 name:PM_MRK_FILT_MATCH : Marked filter Match. ++event:0x1013c counters:0 um:zero minimum:1000 name:PM_MRK_FIN_STALL_CYC : Marked instruction Finish Stall cycles (marked finish after NTC) (use edge detect to count #). ++event:0x20134 counters:1 um:zero minimum:1000 name:PM_MRK_FXU_FIN : fxu marked instr finish. + event:0x40130 counters:3 um:zero minimum:1000 name:PM_MRK_GRP_CMPL : marked instruction finished (completed). ++event:0x4013a counters:3 um:zero minimum:1000 name:PM_MRK_GRP_IC_MISS : Marked Group experienced I cache miss. ++event:0x3013c counters:2 um:zero minimum:1000 name:PM_MRK_GRP_NTC : Marked group ntc cycles. + event:0x20130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? ++event:0x30130 counters:2 um:zero minimum:1000 name:PM_MRK_INST_FIN : marked instr finish any unit . ++event:0x10132 counters:0 um:zero minimum:1000 name:PM_MRK_INST_ISSUED : Marked instruction issued. ++event:0x40134 counters:3 um:zero minimum:1000 name:PM_MRK_INST_TIMEO : marked Instruction finish timeout (instruction lost). + event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. 
++event:0x3012a counters:2 um:zero minimum:1000 name:PM_MRK_L2_RC_DONE : Marked RC done. ++event:0x40116 counters:3 um:zero minimum:1000 name:PM_MRK_LARX_FIN : Larx finished . ++event:0x1013f counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED : Marked Load exposed Miss (use edge detect to count #) ++event:0x1013e counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED_CYC : Marked Load exposed Miss (use edge detect to count #). + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. ++event:0x40132 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_FIN : lsu marked instr finish. ++event:0xd180 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH : Flush: (marked) : All Cases42 ++event:0xd188 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_LRQ : Flush: (marked) LRQMarked LRQ flushes ++event:0xd18a counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_SRQ : Flush: (marked) SRQMarked SRQ lhs flushes ++event:0xd184 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_ULD : Flush: (marked) Unaligned LoadMarked unaligned load flushes ++event:0xd186 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_UST : Flush: (marked) Unaligned StoreMarked unaligned store flushes ++event:0x40164 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_REJECT : LSU marked reject (up to 2 per cycle). ++event:0x30164 counters:2 um:zero minimum:1000 name:PM_MRK_LSU_REJECT_ERAT_MISS : LSU marked reject due to ERAT (up to 2 per cycle). ++event:0x20112 counters:1 um:zero minimum:1000 name:PM_MRK_NTF_FIN : Marked next to finish instruction finished. ++event:0x1d15e counters:0 um:zero minimum:10000 name:PM_MRK_RUN_CYC : Marked run cycles. ++event:0x1d15a counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_EFF : Marked src pref track was effective. ++event:0x3d15a counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked src. 
++event:0x4d15c counters:3 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD : Prefetch tracked was moderate for marked src. ++event:0x1d15c counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L2 : Marked src Prefetch Tracked was moderate (source L2). ++event:0x3d15c counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked src. + event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). ++event:0x3e158 counters:2 um:zero minimum:1000 name:PM_MRK_STCX_FAIL : marked stcx failed. ++event:0x30134 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL_INT : marked store complete (data home) with intervention. ++event:0x3f150 counters:2 um:zero minimum:1000 name:PM_MRK_ST_DRAIN_TO_L2DISP_CYC : cycles to drain st from core to L2. ++event:0x3012c counters:2 um:zero minimum:1000 name:PM_MRK_ST_FWD : Marked st forwards. ++event:0x1f150 counters:0 um:zero minimum:1000 name:PM_MRK_ST_L2DISP_TO_CMPL_CYC : cycles from L2 rc disp to l2 rc completion. ++event:0x20138 counters:1 um:zero minimum:1000 name:PM_MRK_ST_NEST : Marked store sent to nest. ++event:0x1c15a counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_EFF : Marked target pref track was effective. ++event:0x3c15a counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked target. ++event:0x4c15c counters:3 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD : Prefetch tracked was moderate for marked target. ++event:0x1c15c counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L2 : Marked target Prefetch Tracked was moderate (source L2). ++event:0x3c15c counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked target. ++event:0x30132 counters:2 um:zero minimum:1000 name:PM_MRK_VSU_FIN : vsu (fpu) marked instr finish. 
++event:0x3d15e counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked instr. ++event:0x20b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NESTED_TEND : Completion time nested tend + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. +-event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed. ++event:0x20b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_NON_FAV_TBEGIN : Dispatch time non favored tbegin ++event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed. ++event:0x20ac counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TBEGIN : Completion time outer tbegin ++event:0x20ae counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TEND : Completion time outer tend + event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. ++event:0x30020 counters:2 um:zero minimum:10000 name:PM_PMC2_REWIND : PMC2 Rewind Event (did not match condition). ++event:0x10022 counters:0 um:zero minimum:10000 name:PM_PMC2_SAVED : PMC2 Rewind Value saved (matched condition). + event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. + event:0x10010 counters:0 um:zero minimum:10000 name:PM_PMC4_OVERFLOW : Overflow from counter 4. ++event:0x10020 counters:0 um:zero minimum:10000 name:PM_PMC4_REWIND : PMC4 Rewind Event (did not match condition). ++event:0x30022 counters:2 um:zero minimum:10000 name:PM_PMC4_SAVED : PMC4 Rewind Value saved (matched condition). ++event:0x10024 counters:0 um:zero minimum:10000 name:PM_PMC5_OVERFLOW : Overflow from counter 5. 
+ event:0x30024 counters:2 um:zero minimum:10000 name:PM_PMC6_OVERFLOW : Overflow from counter 6. ++event:0x2005a counters:1 um:zero minimum:10000 name:PM_PREF_TRACKED : Total number of Prefetch Operations that were tracked. ++event:0x1005a counters:0 um:zero minimum:10000 name:PM_PREF_TRACK_EFF : Prefetch Tracked was effective. ++event:0x3005a counters:2 um:zero minimum:10000 name:PM_PREF_TRACK_INEFF : Prefetch tracked was ineffective. ++event:0x4005a counters:3 um:zero minimum:10000 name:PM_PREF_TRACK_MOD : Prefetch tracked was moderate. ++event:0x1005c counters:0 um:zero minimum:10000 name:PM_PREF_TRACK_MOD_L2 : Prefetch Tracked was moderate (source L2). ++event:0x3005c counters:2 um:zero minimum:10000 name:PM_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3). ++event:0x40014 counters:3 um:zero minimum:10000 name:PM_PROBE_NOP_DISP : ProbeNops dispatched. ++event:0xe084 counters:0,1,2,3 um:zero minimum:10000 name:PM_PTE_PREFETCH : PTE prefetches42 ++event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump prediction correct. Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 ++event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0x200301ea counters:2 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc ++event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc ++event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine ++event:0x20004 counters:1 um:zero minimum:10000 name:PM_REAL_SRQ_FULL : Out of real srq entries. ++event:0x3006c counters:2 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_MODE : Cycles run latch is set and core is in SMT2 mode. ++event:0x2006a counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_SHRD_MODE : Cycles run latch is set and core is in SMT2-shared mode. ++event:0x1006a counters:0 um:zero minimum:100000 name:PM_RUN_CYC_SMT2_SPLIT_MODE : Cycles run latch is set and core is in SMT2-split mode. ++event:0x2006c counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT4_MODE : Cycles run latch is set and core is in SMT4 mode. ++event:0x4006c counters:3 um:zero minimum:100000 name:PM_RUN_CYC_SMT8_MODE : Cycles run latch is set and core is in SMT8 mode. ++event:0x1006c counters:0 um:zero minimum:100000 name:PM_RUN_CYC_ST_MODE : Cycles run latch is set and core is in ST mode. 
++event:0x10008 counters:0 um:zero minimum:10000 name:PM_RUN_SPURR : Run SPURR. ++event:0xf082 counters:0,1,2,3 um:zero minimum:10000 name:PM_SEC_ERAT_HIT : secondary ERAT Hit42 ++event:0x508c counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_CREATED : Store-Hit-Load Table Entry Created ++event:0x508e counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_CONVERT : Store-Hit-Load Table Read Hit with entry Enabled ++event:0x5090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DISABLE : Store-Hit-Load Table Read Hit with entry Disabled (entry was disabled due to the entry shown to not prevent the flush) ++event:0x26085 counters:1 um:zero minimum:10000 name:PM_SN0_ALLOC : 0.0 ++event:0x26084 counters:1 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0xd0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_SNOOP_TLBIE : TLBIE snoopSnoop TLBIE ++event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu ++event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te ++event:0x4608c counters:3 um:zero minimum:10000 name:PM_SN_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 SN machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x10028 counters:0 um:zero minimum:10000 name:PM_STALL_END_GCT_EMPTY : Count ended because GCT went empty. ++event:0x1e058 counters:0 um:zero minimum:10000 name:PM_STCX_FAIL : stcx failed . ++event:0xc090 counters:0,1,2,3 um:zero minimum:10000 name:PM_STCX_LSU : STCX executed reported at sent to nest42 ++event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail ++event:0x20016 counters:1 um:zero minimum:10000 name:PM_ST_CMPL : Store completion count. ++event:0x20018 counters:1 um:zero minimum:10000 name:PM_ST_FWD : Store forwards that finished. 
++event:0x0 counters:0,1,2,3 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF. ++event:0x3090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_CANCEL : SWAP cancel , rtag not available ++event:0x3092 counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_CANCEL_GPR : SWAP cancel , rtag not available for gpr ++event:0x308c counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_COMPLETE : swap cast in completed ++event:0x308e counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_COMPLETE_GPR : swap cast in completed fpr gpr ++event:0x15152 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_LINK : Marked Branch and link branch that can cause a synchronous interrupt. ++event:0x1515c counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_MPRED : Marked Branch mispredict that can cause a synchronous interrupt. ++event:0x15156 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_FX_DIVIDE : Marked fixed point divide that can cause a synchronous interrupt. ++event:0x15158 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2HIT : Marked L2 Hits that can throw a synchronous interrupt. ++event:0x1515a counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2MISS : Marked L2 Miss that can throw a synchronous interrupt. ++event:0x15154 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L3MISS : Marked L3 misses that can throw a synchronous interrupt. ++event:0x15150 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_PROBE_NOP : Marked probeNops which can cause synchronous interrupts. ++event:0x30050 counters:2 um:zero minimum:10000 name:PM_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). 
++event:0x30052 counters:2 um:zero minimum:10000 name:PM_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x40050 counters:3 um:zero minimum:10000 name:PM_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x10026 counters:0 um:zero minimum:10000 name:PM_TABLEWALK_CYC : Tablewalk Active. ++event:0xe086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TABLEWALK_CYC_PREF : tablewalk qualified for pte prefetches42 ++event:0x20b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_TABORT_TRECLAIM : Completion time tabortnoncd, tabortcd, treclaim ++event:0xe0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TEND_PEND_CYC : TEND latency per thread42 + event:0x2000c counters:1 um:zero minimum:100000 name:PM_THRD_ALL_RUN_CYC : All Threads in Run_cycles (was both threads in run_cycles). ++event:0x10012 counters:0 um:zero minimum:10000 name:PM_THRD_GRP_CMPL_BOTH_CYC : Two threads finished same cycle (gated by run latch). 
++event:0x40bc counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_0_1_CYC : Cycles thread running at priority level 0 or 1 ++event:0x40be counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_2_3_CYC : Cycles thread running at priority level 2 or 3 ++event:0x5080 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_4_5_CYC : Cycles thread running at priority level 4 or 5 ++event:0x5082 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_6_7_CYC : Cycles thread running at priority level 6 or 7 ++event:0x3098 counters:0,1,2,3 um:zero minimum:10000 name:PM_THRD_REBAL_CYC : cycles rebalance was active + event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshold counter did not meet threshold. ++event:0x30058 counters:2 um:zero minimum:10000 name:PM_TLBIE_FIN : tlbie finished. ++event:0x20066 counters:1 um:zero minimum:10000 name:PM_TLB_MISS : TLB Miss (I + D). ++event:0x20b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_BEGIN_ALL : Tm any tbegin ++event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC ++event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capacity Overflow ++event:0x20ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_END_ALL : Tm any tend ++event:0x3086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_NON_TM : TEXAS fail reason @ completion ++event:0x3088 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CON_TM : TEXAS fail reason @ completion ++event:0xe0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_DISALLOW : TM fail disallow42 ++event:0x3084 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_FOOTPRINT_OVERFLOW : TEXAS fail reason @ completion ++event:0xe0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_NON_TX_CONFLICT : Non transactional conflict from LSU whatever gets reported to texas42 ++event:0x308a counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_SELF : TEXAS fail 
reason @ completion ++event:0xe0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TLBIE : TLBIE hit bloom filter42 ++event:0xe0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TX_CONFLICT : Transactional conflict from LSU, whatever gets reported to texas 42 ++event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail ++event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail ++event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) ++event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc ++event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line ++event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to fail ++event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) ++event:0x20bc counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TBEGIN : Tm nested tbegin ++event:0x10060 counters:0 um:zero minimum:10000 name:PM_TM_TRANS_RUN_CYC : run cycles in transactional state. ++event:0x30060 counters:2 um:zero minimum:10000 name:PM_TM_TRANS_RUN_INST : Instructions completed in transactional state. ++event:0x3080 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TRESUME : Tm resume ++event:0x20be counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TSUSPEND : Tm suspend ++event:0x2e012 counters:1 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_CYC : run cycles spent in successful transactions. ++event:0x4e014 counters:3 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_INST : run instructions spent in successful transactions. 
++event:0xe08c counters:0,1,2,3 um:zero minimum:10000 name:PM_UP_PREF_L3 : Micropartition prefetch42 ++event:0xe08e counters:0,1,2,3 um:zero minimum:10000 name:PM_UP_PREF_POINTER : Micropartition pointer prefetches42 ++event:0xa0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_16FLOP : Sixteen flops operation (SP vector versions of fdiv,fsqrt) ++event:0xa080 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_1FLOP : one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation finishedDecode into 1,2,4 FLOP according to instr IOP, multiplied by #vector elements according to route( eg x1, x2, x4) Only if instr sends finish to ISU ++event:0xa098 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_2FLOP : two flops operation (scalar fmadd, fnmadd, fmsub, fnmsub and DP vector versions of single flop instructions) ++event:0xa09c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_4FLOP : four flops operation (scalar fdiv, fsqrt, DP vector version of fmadd, fnmadd, fmsub, fnmsub, SP vector versions of single flop instructions) ++event:0xa0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_8FLOP : eight flops operation (DP vector versions of fdiv,fsqrt and SP vector versions of fmadd,fnmadd,fmsub,fnmsub) ++event:0xb0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_COMPLEX_ISSUED : Complex VMX instruction issued ++event:0xb0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_CY_ISSUED : Cryptographic instruction RFC02196 Issued ++event:0xb0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DD_ISSUED : 64BIT Decimal Issued ++event:0xa08c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_2FLOP : DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres ,fsqrte, fneg ++event:0xa090 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_FMA : DP vector version of fmadd,fnmadd,fmsub,fnmsub ++event:0xa094 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_FSQRT_FDIV : DP vector versions of fdiv,fsqrt ++event:0xb0ac 
counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DQ_ISSUED : 128BIT Decimal Issued ++event:0xb0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_EX_ISSUED : Direct move 32/64b VRFtoGPR RFC02206 Issued ++event:0xa0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FIN : VSU0 Finished an instruction ++event:0xa084 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FMA : two flops operation (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only! ++event:0xb098 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FPSCR : Move to/from FPSCR type instruction issued on Pipe 0 ++event:0xa088 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FSQRT_FDIV : four flops operation (fdiv,fsqrt) Scalar Instructions only! ++event:0xb090 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_PERMUTE_ISSUED : Permute VMX Instruction Issued ++event:0xb088 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SCALAR_DP_ISSUED : Double Precision scalar instruction issued on Pipe0 ++event:0xb094 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SIMPLE_ISSUED : Simple VMX instruction issued ++event:0xa0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SINGLE : FPU single precision ++event:0xb09c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SQ : Store Vector Issued ++event:0xb08c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_STF : FPU store (SP or DP) issued on Pipe0 ++event:0xb080 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_VECTOR_DP_ISSUED : Double Precision vector instruction issued on Pipe0 ++event:0xb084 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_VECTOR_SP_ISSUED : Single Precision vector instruction issued (executed) ++event:0xa0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_16FLOP : Sixteen flops operation (SP vector versions of fdiv,fsqrt) ++event:0xa082 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_1FLOP : one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation finished ++event:0xa09a 
counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_2FLOP : two flops operation (scalar fmadd, fnmadd, fmsub, fnmsub and DP vector versions of single flop instructions) ++event:0xa09e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_4FLOP : four flops operation (scalar fdiv, fsqrt, DP vector version of fmadd, fnmadd, fmsub, fnmsub, SP vector versions of single flop instructions) ++event:0xa0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_8FLOP : eight flops operation (DP vector versions of fdiv,fsqrt and SP vector versions of fmadd,fnmadd,fmsub,fnmsub) ++event:0xb0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_COMPLEX_ISSUED : Complex VMX instruction issued ++event:0xb0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_CY_ISSUED : Cryptographic instruction RFC02196 Issued ++event:0xb0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DD_ISSUED : 64BIT Decimal Issued ++event:0xa08e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_2FLOP : DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres ,fsqrte, fneg ++event:0xa092 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_FMA : DP vector version of fmadd,fnmadd,fmsub,fnmsub ++event:0xa096 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_FSQRT_FDIV : DP vector versions of fdiv,fsqrt ++event:0xb0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DQ_ISSUED : 128BIT Decimal Issued ++event:0xb0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_EX_ISSUED : Direct move 32/64b VRFtoGPR RFC02206 Issued ++event:0xa0be counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FIN : VSU1 Finished an instruction ++event:0xa086 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FMA : two flops operation (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only! 
++event:0xb09a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FPSCR : Move to/from FPSCR type instruction issued on Pipe 0 ++event:0xa08a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FSQRT_FDIV : four flops operation (fdiv,fsqrt) Scalar Instructions only! ++event:0xb092 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_PERMUTE_ISSUED : Permute VMX Instruction Issued ++event:0xb08a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SCALAR_DP_ISSUED : Double Precision scalar instruction issued on Pipe1 ++event:0xb096 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SIMPLE_ISSUED : Simple VMX instruction issued ++event:0xa0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SINGLE : FPU single precision ++event:0xb09e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SQ : Store Vector Issued ++event:0xb08e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_STF : FPU store (SP or DP) issued on Pipe1 ++event:0xb082 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_VECTOR_DP_ISSUED : Double Precision vector instruction issued on Pipe1 ++event:0xb086 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_VECTOR_SP_ISSUED : Single Precision vector instruction issued (executed) diff --git a/SOURCES/oprofile-power9.patch b/SOURCES/oprofile-power9.patch new file mode 100644 index 0000000..a1d96f6 --- /dev/null +++ b/SOURCES/oprofile-power9.patch @@ -0,0 +1,1189 @@ +commit ce5842f112d155a7148a44a7863cf4355c1385e0 +Author: Will Schmidt +Date: Tue Jun 6 10:52:00 2017 -0500 + + 1/2 Oprofile support for Power9 + + Hi, + + Add initial support for PPC64/Power9 to oprofile. + This includes the documentation updates to indicate "power9", + the PVR values, and all the other updates that will be necessary. + (This is based on a review of changes made when Power8 support + was initially added). + The long-ish event list follows as a subsequent patch. 
+ + Signed-off-by: Will Schmidt + + -- + +diff --git a/doc/oprofile.xml b/doc/oprofile.xml +index 325ef6f..01930ab 100644 +--- a/doc/oprofile.xml ++++ b/doc/oprofile.xml +@@ -486,7 +486,7 @@ can be used for ocount, minus the count + Itaniumia64/itaniumCPU_CYCLES:100000:0:1:1 + Itanium 2ia64/itanium2CPU_CYCLES:100000:0:1:1 + TIMER_INTtimerNone selectable +-IBM pseriesPowerPC 4/5/6/7/8/970/CellCYCLES:100000:0:1:1 ++IBM pseriesppc64/power{ 4|5|6|7|8|9|970 }CYCLES:100000:0:1:1 + IBM s390timerNone selectable + IBM s390xtimerNone selectable + +diff --git a/events/Makefile.am b/events/Makefile.am +index db43550..b8f06af 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -29,6 +29,7 @@ event_files = \ + ppc64/power6/events ppc64/power6/event_mappings ppc64/power6/unit_masks \ + ppc64/power7/events ppc64/power7/event_mappings ppc64/power7/unit_masks \ + ppc64/power8/events ppc64/power8/unit_masks \ ++ ppc64/power9/events ppc64/power9/unit_masks \ + ppc64/970/events ppc64/970/event_mappings ppc64/970/unit_masks \ + ppc64/970MP/events ppc64/970MP/event_mappings ppc64/970MP/unit_masks \ + ppc64/ibm-compat-v1/events ppc64/ibm-compat-v1/event_mappings ppc64/ibm-compat-v1/unit_masks \ +diff --git a/events/ppc64/power9/events b/events/ppc64/power9/events +new file mode 100644 +index 0000000..a2071e7 +--- /dev/null ++++ b/events/ppc64/power9/events +@@ -0,0 +1,8 @@ ++ ++# Copyright OProfile authors ++# Copyright (c) International Business Machines, 2017. ++# Contributed by Will Schmidt . ++# ++# IBM POWER9 Events ++ ++include:ppc64/architected_events_v1 +diff --git a/events/ppc64/power9/unit_masks b/events/ppc64/power9/unit_masks +new file mode 100644 +index 0000000..e384695 +--- /dev/null ++++ b/events/ppc64/power9/unit_masks +@@ -0,0 +1,9 @@ ++# ++# Copyright OProfile authors ++# Copyright (c) International Business Machines, 2017. 
++# Contributed by Will Schmidt ++# ++# ppc64 POWER9 possible unit masks ++# ++name:zero type:mandatory default:0x0 ++ 0x0 No unit mask +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 3d3c9c8..7acecda 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -119,6 +119,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, + { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, ++ { "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -292,6 +293,8 @@ static op_cpu _try_ppc64_arch_generic_cpu(void) + */ + if ((strcmp(platform, "power7") == 0) && (strcmp(base_platform, "power8") == 0)) + cpu_type = CPU_PPC64_POWER8; ++ else if ((strcmp(platform, "power8") == 0) && (strcmp(base_platform, "power9") == 0)) ++ cpu_type = CPU_PPC64_POWER9; + else + cpu_type = CPU_PPC64_ARCH_V1; + } +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 78eb9bc..39b7726 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -105,6 +105,7 @@ typedef enum { + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ + CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ ++ CPU_PPC64_POWER9, /**< ppc64 POWER9 family */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 0c7e9bc..0ba57e0 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1259,6 +1259,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_PPC64_IBM_COMPAT_V1: + case CPU_PPC64_ARCH_V1: + case CPU_PPC64_POWER8: ++ case CPU_PPC64_POWER9: + descr->name = "CYCLES"; + break; + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index ff4e3f1..6eb299c 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -680,6 +680,12 
@@ int main(int argc, char const * argv[]) + "http://www-306.ibm.com/chips/techlib/techlib.nsf/products/Cell_Broadband_Engine\n"; + break; + ++ case CPU_PPC64_POWER9: ++ event_doc = ++ "This processor type is fully supported with operf.\n" ++ "See Power ISA 3.0 at https://www.power.org/\n\n"; ++ break; ++ + case CPU_MIPS_20K: + event_doc = + "See Programming the MIPS64 20Kc Processor Core User's " +commit f5a8f00559e030fd874ab4e046814d638343b054 +Author: Will Schmidt +Date: Tue Jun 6 10:52:22 2017 -0500 + + 2/2 Oprofile support for Power9 (event list) + + Hi, + + Add the Oprofile event list for ppc64/power9. + + As indicated in the comments below, the event list is preliminary + at this time, and may or may not have subsequent updates. + + Thanks, + + Signed-off-by: Will Schmidt + + -- + +diff --git a/events/ppc64/power9/events b/events/ppc64/power9/events +index a2071e7..7264515 100644 +--- a/events/ppc64/power9/events ++++ b/events/ppc64/power9/events +@@ -6,3 +6,987 @@ + # IBM POWER9 Events + + include:ppc64/architected_events_v1 ++ ++# This table has been automatically generated with a preliminary list of ++# events, and is subject to verification and update. ++# Last Refresh. ( will schmidt , Jun 06,2017 ). ++ ++# Abbreviation hints: ++# BHT - Branch History Table ++# DARQ - Data and Address Recycle/Recirculation Queue ++# ERAT - Effective to Real Address Translation ++# FAB - Fabric ++# HPT - Hardware Page Table ++# IBUFF - Instruction Fetch Buffer ++# IFAR - Instruction Fetch Address Register ++# LHS/LDHITST - Load Hit Store ++# MEPF - PreFetch. ++# NTC - Next To Complete ++# NTF - Next To Finish. 
++# PMU - Performance Monitor Unit ++# RIS - Random Instruction Sampling ++# rty - retry ++# TAGE - Tagged Geometric History Length predictor (branch prediction) ++# TM - Transactional Memory ++ ++ ++event:0x0000045050 counters:3 um:zero minimum:10000 name:PM_1FLOP_CMPL : one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed ++event:0x00000100F2 counters:0 um:zero minimum:10000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished ++event:0x00000400F2 counters:3 um:zero minimum:10000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched ++event:0x000004D052 counters:3 um:zero minimum:10000 name:PM_2FLOP_CMPL : DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres ,fsqrte, fneg ++event:0x0000045052 counters:3 um:zero minimum:10000 name:PM_4FLOP_CMPL : 4 FLOP instruction completed ++event:0x000004D054 counters:3 um:zero minimum:10000 name:PM_8FLOP_CMPL : 8 FLOP instruction completed ++event:0x00000100FA counters:0 um:zero minimum:10000 name:PM_ANY_THRD_RUN_CYC : Cycles in which at least one thread has the run latch set ++event:0x000002505E counters:1 um:zero minimum:10000 name:PM_BACK_BR_CMPL : Branch instruction completed with a target address less than current instruction address ++event:0x0000004880 counters:0,1,2,3 um:zero minimum:10000 name:PM_BANK_CONFLICT : Read blocked due to interleave conflict. The ifar logic will detect an interleave conflict and kill the data that was read that cycle. ++event:0x000003005C counters:2 um:zero minimum:10000 name:PM_BFU_BUSY : Cycles in which all 4 Binary Floating Point units are busy. 
The BFU is running at capacity ++event:0x0000020036 counters:1 um:zero minimum:10000 name:PM_BR_2PATH : Branches that are not strongly biased ++event:0x0000040036 counters:3 um:zero minimum:10000 name:PM_BR_2PATH : Branches that are not strongly biased ++event:0x000004D05E counters:3 um:zero minimum:10000 name:PM_BR_CMPL : Any Branch instruction completed ++event:0x000000489C counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_CORECT_PRED_TAKEN_CMPL : Conditional Branch Completed in which the HW correctly predicted the direction as taken. Counted at completion time ++event:0x00000040AC counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_CCACHE : Conditional Branch Completed that was Mispredicted due to the Count Cache Target Prediction ++event:0x00000400F6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts ++event:0x00000048AC counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_LSTACK : Conditional Branch Completed that was Mispredicted due to the Link Stack Target Prediction ++event:0x00000048B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_PCACHE : Conditional Branch Completed that was Mispredicted due to pattern cache prediction ++event:0x00000040B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_TAKEN_CR : A Conditional Branch that resolved to taken was mispredicted as not taken (due to the BHT Direction Prediction). ++event:0x00000048B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_TAKEN_TA : Conditional Branch Completed that was Mispredicted due to the Target Address Prediction from the Count Cache or Link Stack. Only XL-form branches that resolved Taken set this event. ++event:0x000000409C counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED : Conditional Branch Executed in which the HW predicted the Direction or Target. 
Includes taken and not taken and is counted at execution time ++event:0x00000040A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE : Conditional Branch Completed that used the Count Cache for Target Prediction ++event:0x00000040A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK : Conditional Branch Completed that used the Link Stack for Target Prediction ++event:0x00000048A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_PCACHE : Conditional branch completed that used pattern cache prediction ++event:0x00000040B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA : Conditional Branch Completed that had its target address predicted. Only XL-form branches set this event. This equals the sum of CCACHE, LSTACK, and PCACHE ++event:0x00000040B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TAKEN_CR : Conditional Branch that had its direction predicted. I-form branches do not set this event. In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and branches ++event:0x00000200FA counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : New event for Branch Taken ++event:0x0000010068 counters:0 um:zero minimum:10000 name:PM_BRU_FIN : Branch Instruction Finished ++event:0x00000040A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND : Unconditional Branch Completed. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was converted to a Resolve. ++event:0x00000050B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BTAC_BAD_RESULT : BTAC thinks branch will be taken but it is either predicted not-taken by the BHT, or the target address is wrong (less common). 
In both cases, a redirect will happen ++event:0x00000058B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BTAC_GOOD_RESULT : BTAC predicts a taken branch and the BHT agrees, and the target address is correct ++event:0x0000010050 counters:0 um:zero minimum:10000 name:PM_CHIP_PUMP_CPRED : Initial and Final Pump Scope was chip pump (prediction=correct) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x000000208C counters:0,1,2,3 um:zero minimum:10000 name:PM_CLB_HELD : CLB (control logic block - indicates quadword fetch block) Hold: Any Reason ++event:0x000001E054 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL : Nothing completed and ICT not empty ++event:0x000001E05A counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_ANY_SYNC : Cycles in which the NTC sync instruction (isync, lwsync or hwsync) is not allowed to complete ++event:0x000004D018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU : Completion stall due to a Branch Unit ++event:0x000004C01E counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_CRYPTO : Finish stall because the NTF instruction was routed to the crypto execution pipe and was waiting to finish ++event:0x000002C012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_DCACHE_MISS : Finish stall because the NTF instruction was a load that missed the L1 and was waiting for the data to return from the nest ++event:0x000001005A counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_DFLONG : Finish stall because the NTF instruction was a multi-cycle instruction issued to the Decimal Floating Point execution pipe and waiting to finish. Includes decimal floating point instructions + 128 bit binary floating point instructions. Qualified by multicycle ++event:0x000002D012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_DFU : Finish stall because the NTF instruction was issued to the Decimal Floating Point execution pipe and waiting to finish. 
Includes decimal floating point instructions + 128 bit binary floating point instructions. Not qualified by multicycle ++event:0x000002C018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_L21_L31 : Completion stall by Dcache miss which resolved on chip ( excluding local L2/L3) ++event:0x000001003C counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_L2L3 : Completion stall by Dcache miss which resolved in L2/L3 ++event:0x000004C016 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_L2L3_CONFLICT : Completion stall due to cache miss that resolves in the L2 or L3 with a conflict ++event:0x000004C01A counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_L3MISS : Completion stall due to cache miss resolving missed the L3 ++event:0x0000030038 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_LMEM : Completion stall due to cache miss that resolves in local memory ++event:0x000002C01C counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_DMISS_REMOTE : Completion stall by Dcache miss which resolved from remote chip (cache or memory) ++event:0x000001005C counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_DP : Finish stall because the NTF instruction was a scalar instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Not qualified multicycle. Qualified by NOT vector ++event:0x000003405C counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_DPLONG : Finish stall because the NTF instruction was a scalar multi-cycle instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. 
Qualified by NOT vector AND multicycle ++event:0x000004D01A counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_EIEIO : Finish stall because the NTF instruction is an EIEIO waiting for response from L2 ++event:0x0000030004 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_EMQ_FULL : Finish stall because the next to finish instruction suffered an ERAT miss and the EMQ was full ++event:0x000004C012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_ERAT_MISS : Finish stall because the NTF instruction was a load or store that suffered a translation miss ++event:0x000003003A counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_EXCEPTION : Cycles in which the NTC instruction is not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete ++event:0x000002D018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_EXEC_UNIT : Completion stall due to execution units (FXU/VSU/CRU) ++event:0x000001E056 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_FLUSH_ANY_THREAD : Cycles in which the NTC instruction is not allowed to complete because any of the 4 threads in the same core suffered a flush, which blocks completion ++event:0x000004D016 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_FXLONG : Completion stall due to a long latency scalar fixed point instruction (division, square root) ++event:0x000002D016 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_FXU : Finish stall due to a scalar fixed point or CR instruction in the execution pipeline. 
These instructions get routed to the ALU, ALU2, and DIV pipes ++event:0x0000030036 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_HWSYNC : completion stall due to hwsync ++event:0x000001002A counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LARX : Finish stall because the NTF instruction was a larx waiting to be satisfied ++event:0x000002C01A counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LHS : Finish stall because the NTF instruction was a load that hit on an older store and it was waiting for store data ++event:0x000004C014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LMQ_FULL : Finish stall because the NTF instruction was a load that missed in the L1 and the LMQ was unable to accept this load miss request because it was full ++event:0x000004D014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LOAD_FINISH : Finish stall because the NTF instruction was a load instruction with all its dependencies satisfied just going through the LSU pipe to finish ++event:0x000002D014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LRQ_FULL : Finish stall because the NTF instruction was a load that was held in LSAQ (load-store address queue) because the LRQ (load-reorder queue) was full ++event:0x0000010004 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LRQ_OTHER : Finish stall due to LRQ miscellaneous reasons, lost arbitration to LMQ slot, bank collisions, set prediction cleanup, set prediction multihit and others ++event:0x000004E016 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LSAQ_ARB : Finish stall because the NTF instruction was a load or store that was held in LSAQ because an older instruction from SRQ or LRQ won arbitration to the LSU pipe when this instruction tried to launch ++event:0x000002C010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU : Completion stall by LSU instruction ++event:0x000001003A counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU_FIN : Finish stall because the NTF instruction was an LSU op 
(other than a load or a store) with all its dependencies met and just going through the LSU pipe to finish ++event:0x000002E01A counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU_FLUSH_NEXT : Completion stall of one cycle because the LSU requested to flush the next iop in the sequence. It takes 1 cycle for the ISU to process this request before the LSU instruction is allowed to complete ++event:0x0000034056 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU_MFSPR : Finish stall because the NTF instruction was a mfspr instruction targeting an LSU SPR and it was waiting for the register data to be returned ++event:0x0000010036 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LWSYNC : completion stall due to lwsync ++event:0x000004E012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_MTFPSCR : Completion stall because the ISU is updating the register and notifying the Effective Address Table (EAT) ++event:0x000001E05C counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_NESTED_TBEGIN : Completion stall because the ISU is updating the TEXASR to keep track of the nested tbegin. This is a short delay, and it includes ROT ++event:0x000003003C counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_NESTED_TEND : Completion stall because the ISU is updating the TEXASR to keep track of the nested tend and decrement the TEXASR nested level. This is a short delay ++event:0x000004E018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_NTC_DISP_FIN : Finish stall because the NTF instruction was one that must finish at dispatch. 
++event:0x000002E01E counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NTC_FLUSH : Completion stall due to ntc flush ++event:0x0000030006 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_OTHER_CMPL : Instructions the core completed while this thread was stalled ++event:0x000002C016 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_PASTE : Finish stall because the NTF instruction was a paste waiting for response from L2 ++event:0x000003000A counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_PM : Finish stall because the NTF instruction was issued to the Permute execution pipe and waiting to finish. Includes permute and decimal fixed point instructions (128 bit BCD arithmetic) + a few 128 bit fixpoint add/subtract instructions with carry. Not qualified by vector or multicycle ++event:0x000001E052 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_SLB : Finish stall because the NTF instruction was awaiting L2 response for an SLB ++event:0x0000030028 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_SPEC_FINISH : Finish stall while waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC ++event:0x0000030016 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_SRQ_FULL : Finish stall because the NTF instruction was a store that was held in LSAQ because the SRQ was full ++event:0x000002D01C counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_STCX : Finish stall because the NTF instruction was a stcx waiting for response from L2 ++event:0x000004C01C counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_ST_FWD : Completion stall due to store forward ++event:0x0000030026 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE_DATA : Finish stall because the next to finish instruction was a store waiting on data ++event:0x0000030014 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE_FIN_ARB : Finish stall because the NTF instruction was a store waiting for a slot in 
the store finish pipe. This means the instruction is ready to finish but there are instructions ahead of it, using the finish pipe ++event:0x000002C014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE_FINISH : Finish stall because the NTF instruction was a store with all its dependencies met, just waiting to go through the LSU pipe to finish ++event:0x000004C010 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE_PIPE_ARB : Finish stall because the NTF instruction was a store waiting for the next relaunch opportunity after an internal reject. This means the instruction is ready to relaunch and tried once but lost arbitration ++event:0x000002C01E counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_SYNC_PMU_INT : Cycles in which the NTC instruction is waiting for a synchronous PMU interrupt ++event:0x000001E050 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_TEND : Finish stall because the NTF instruction was a tend instruction awaiting response from L2 ++event:0x000001001C counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_THRD : Completion Stalled because the thread was blocked ++event:0x000002E01C counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_TLBIE : Finish stall because the NTF instruction was a tlbie waiting for response from L2 ++event:0x000004405C counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_VDP : Finish stall because the NTF instruction was a vector instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Not qualified multicycle. Qualified by vector ++event:0x000003C05A counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_VDPLONG : Finish stall because the NTF instruction was a scalar multi-cycle instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. 
Qualified by NOT vector AND multicycle ++event:0x000002E018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VFXLONG : Completion stall due to a long latency vector fixed point instruction (division, square root) ++event:0x000003C05C counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_VFXU : Finish stall due to a vector fixed point instruction in the execution pipeline. These instructions get routed to the ALU, ALU2, and DIV pipes ++event:0x000003608C counters:2 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point) ++event:0x000004608C counters:3 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point) ++event:0x0000016886 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy ++event:0x0000026086 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) OR L2 TM_store hit dirty HPC line and L3 indicated SC line formed in L3 on RDR bus ++event:0x000002688C counters:1 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle (2to1) window where this signals rotates thru sampling each CO machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x000001001E counters:0 um:zero minimum:10000 name:PM_CYC : Processor cycles ++event:0x000002001E counters:1 um:zero minimum:10000 name:PM_CYC : Processor cycles ++event:0x000003001E counters:2 um:zero minimum:10000 name:PM_CYC : Processor cycles ++event:0x000004001E counters:3 um:zero minimum:10000 name:PM_CYC : Processor cycles ++event:0x000004D04A counters:3 um:zero minimum:10000 name:PM_DARQ0_0_3_ENTRIES : Cycles in which 3 or less DARQ entries (out of 12) are in use ++event:0x000001D058 counters:0 um:zero minimum:10000 name:PM_DARQ0_10_12_ENTRIES : Cycles in which 10 or more DARQ entries (out of 12) are in use ++event:0x000003504E counters:2 um:zero minimum:10000 name:PM_DARQ0_4_6_ENTRIES : Cycles in which 4, 5, or 6 DARQ entries (out of 12) are in use ++event:0x000002E050 counters:1 um:zero minimum:10000 name:PM_DARQ0_7_9_ENTRIES : Cycles in which 7,8, or 9 DARQ entries (out of 12) are in use ++event:0x000004C122 counters:3 um:zero minimum:10000 name:PM_DARQ1_0_3_ENTRIES : Cycles in which 3 or fewer DARQ1 entries (out of 12) are in use ++event:0x0000020058 counters:1 um:zero minimum:10000 name:PM_DARQ1_10_12_ENTRIES : Cycles in which 10 or more DARQ1 entries (out of 12) are in use ++event:0x000003E050 counters:2 um:zero minimum:10000 name:PM_DARQ1_4_6_ENTRIES : Cycles in which 4, 5, or 6 DARQ1 entries (out of 12) are in use ++event:0x000002005A counters:1 um:zero minimum:10000 name:PM_DARQ1_7_9_ENTRIES : Cycles in which 7 to 9 DARQ1 entries (out of 12) are in use ++event:0x000004405E counters:3 um:zero minimum:10000 name:PM_DARQ_STORE_REJECT : The DARQ attempted to transmit a store into an LSAQ or SRQ entry but It was rejected. Divide by PM_DARQ_STORE_XMIT to get reject ratio ++event:0x0000030064 counters:2 um:zero minimum:10000 name:PM_DARQ_STORE_XMIT : The DARQ attempted to transmit a store into an LSAQ or SRQ entry. Includes rejects. 
Not qualified by thread, so it includes counts for the whole core ++event:0x000001C050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope was chip pump (prediction=correct) for a demand load ++event:0x000004C048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a demand load ++event:0x000003C048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a demand load ++event:0x000003C04C counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a demand load ++event:0x000004C04C counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a demand load ++event:0x000001C042 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a demand load ++event:0x000004C046 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a demand load ++event:0x000003C046 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a demand load ++event:0x000003C040 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a demand load ++event:0x000004C040 counters:3 um:zero minimum:10000 
name:PM_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a demand load ++event:0x000002C040 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to a demand load ++event:0x00000200FE counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit) ++event:0x000001C04E counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS_MOD : The processor's data cache was reloaded from a location other than the local core's L2 due to a demand load ++event:0x000001C040 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a demand load ++event:0x000004C042 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a demand load ++event:0x000004C044 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a demand load ++event:0x000003C044 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a demand load ++event:0x000002C044 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a demand load ++event:0x000001C046 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a demand load ++event:0x000003C042 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 
with dispatch conflict due to a demand load ++event:0x000002C042 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to a demand load ++event:0x00000300FE counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit) ++event:0x000004C04E counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS_MOD : The processor's data cache was reloaded from a location other than the local core's L3 due to a demand load ++event:0x000001C044 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a demand load ++event:0x000001C04C counters:0 um:zero minimum:10000 name:PM_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a demand load ++event:0x000002C048 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a demand load ++event:0x00000400FE counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a demand load ++event:0x000004C04A counters:3 um:zero minimum:10000 name:PM_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a demand load ++event:0x000001C048 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a demand load ++event:0x000002C046 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this 
chip due to a demand load ++event:0x000001C04A counters:0 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a demand load ++event:0x000002C04A counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a demand load ++event:0x000003C04A counters:2 um:zero minimum:10000 name:PM_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a demand load ++event:0x000002C050 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_CPRED : Initial and Final Pump Scope was group pump (prediction=correct) for a demand load ++event:0x000002C052 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED : Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for a demand load ++event:0x000001C052 counters:0 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED_RTY : Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for a demand load ++event:0x000001C054 counters:0 um:zero minimum:10000 name:PM_DATA_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load ++event:0x000004C052 counters:3 um:zero minimum:10000 name:PM_DATA_PUMP_MPRED : Pump misprediction. Counts across all types of pumps for a demand load ++event:0x000000F0A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_DATA_STORE : All ops that drain from s2q to L2 containing data ++event:0x000003C050 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_CPRED : Initial and Final Pump Scope was system pump (prediction=correct) for a demand load ++event:0x000003C052 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED : Final Pump Scope (system) mispredicted. 
Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for a demand load ++event:0x000004C050 counters:3 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED_RTY : Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for a demand load ++event:0x000003001A counters:2 um:zero minimum:10000 name:PM_DATA_TABLEWALK_CYC : Data Tablewalk Cycles. Could be 1 or 2 active tablewalks. Includes data prefetches. ++event:0x000000F8AC counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_DEALLOC_NO_CONF : A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.Fuzzy stream confirm (out of order effects, or pf cant keep up) ++event:0x000000F0A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_CONF : A demand load referenced a line in an active prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Includes forwards and backwards streams ++event:0x000000F0B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_CONS_ALLOC : Prefetch stream allocated in the conservative phase by either the hardware prefetch mechanism or software prefetch ++event:0x000000F8A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_FUZZY_CONF : A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.Fuzzy stream confirm (out of order effects, or pf cant keep up) ++event:0x000000F0A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_HW_ALLOC : Prefetch stream allocated by the hardware prefetch mechanism ++event:0x000000F0AC counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_STRIDED_CONF : A demand load referenced a line in an active strided prefetch stream. 
The stream could have been allocated through the hardware prefetch mechanism or through software. ++event:0x000000F8A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_SW_ALLOC : Prefetch stream allocated by software prefetching ++event:0x000000F8B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_PREF_XCONS_ALLOC : Prefetch stream allocated in the Ultra conservative phase by either the hardware prefetch mechanism or software prefetch ++event:0x00000048B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_FUSION_CONST_GEN : 32-bit constant generation ++event:0x0000005084 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_FUSION_EXT_ADD : 32-bit extended addition ++event:0x00000048A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_FUSION_LD_ST_DISP : 32-bit displacement D-form and 16-bit displacement X-form ++event:0x0000005088 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_FUSION_OP_PRESERV : Destructive op operand preservation ++event:0x00000058A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_HOLD_ICT_FULL : Counts the number of cycles in which the IFU was not able to decode and transmit one or more instructions because all itags were in use. This means the ICT is full for this thread ++event:0x0000005884 counters:0,1,2,3 um:zero minimum:10000 name:PM_DECODE_LANES_NOT_AVAIL : Decode has something to transmit but dispatch lanes are not available ++event:0x000004C054 counters:3 um:zero minimum:10000 name:PM_DERAT_MISS_16G : Data ERAT Miss (Data TLB Access) page size 16G ++event:0x000003C054 counters:2 um:zero minimum:10000 name:PM_DERAT_MISS_16M : Data ERAT Miss (Data TLB Access) page size 16M ++event:0x000002C05A counters:1 um:zero minimum:10000 name:PM_DERAT_MISS_1G : Data ERAT Miss (Data TLB Access) page size 1G. Implies radix translation ++event:0x000001C05A counters:0 um:zero minimum:10000 name:PM_DERAT_MISS_2M : Data ERAT Miss (Data TLB Access) page size 2M. 
Implies radix translation ++event:0x000001C056 counters:0 um:zero minimum:10000 name:PM_DERAT_MISS_4K : Data ERAT Miss (Data TLB Access) page size 4K ++event:0x000002C054 counters:1 um:zero minimum:10000 name:PM_DERAT_MISS_64K : Data ERAT Miss (Data TLB Access) page size 64K ++event:0x000004D04C counters:3 um:zero minimum:10000 name:PM_DFU_BUSY : Cycles in which all 4 Decimal Floating Point units are busy. The DFU is running at capacity ++event:0x000000288C counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_BAL : Dispatch/CLB Hold: Balance Flush ++event:0x0000002090 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_SB : Dispatch/CLB Hold: Scoreboard ++event:0x0000002890 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_TLBIE : Dispatch Hold: Due to TLBIE ++event:0x0000010006 counters:0 um:zero minimum:10000 name:PM_DISP_HELD : Dispatch Held ++event:0x000003D05C counters:2 um:zero minimum:10000 name:PM_DISP_HELD_HB_FULL : Dispatch held due to History Buffer full. Could be GPR/VSR/VMR/FPR/CR/XVF ++event:0x0000020006 counters:1 um:zero minimum:10000 name:PM_DISP_HELD_ISSQ_FULL : Dispatch held due to Issue q full. 
Includes issue queue and branch queue ++event:0x000004003C counters:3 um:zero minimum:10000 name:PM_DISP_HELD_SYNC_HOLD : Cycles in which dispatch is held because of a synchronizing instruction in the pipeline ++event:0x00000028B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_HELD_TBEGIN : This outer tbegin transaction cannot be dispatched until the previous tend instruction completes ++event:0x0000030008 counters:2 um:zero minimum:10000 name:PM_DISP_STARVED : Dispatched Starved ++event:0x000004D05C counters:3 um:zero minimum:10000 name:PM_DP_QP_FLOP_CMPL : Double-Precision or Quad-Precision instruction completed ++event:0x000004E048 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E048 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E04C counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004E04C counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads.
Only PTE reloads are included ++event:0x000001E042 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004E046 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E046 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E040 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E04E counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E040 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000004E042 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004E044 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E044 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E044 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E046 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E042 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000002E042 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004E04E counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E044 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E04C counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E048 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E04C counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000004E04A counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E048 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E046 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001E04A counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002E04A counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003E04A counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a data side request. 
When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x0000036092 counters:2 um:zero minimum:10000 name:PM_DSIDE_L2MEMACC : Valid when first beat of data comes in for an D-side fetch where data came EXCLUSIVELY from memory (excluding hpcread64 accesses), i.e., total memory accesses by RCs ++event:0x0000026884 counters:1 um:zero minimum:10000 name:PM_DSIDE_MRU_TOUCH : D-side L2 MRU touch sent to L2 ++event:0x0000036892 counters:2 um:zero minimum:10000 name:PM_DSIDE_OTHER_64B_L2MEMACC : Valid when first beat of data comes in for an D-side fetch where data came EXCLUSIVELY from memory that was for hpc_read64, (RC had to fetch other 64B of a line from MC) i.e., number of times RC had to go to memory to get 'missing' 64B ++event:0x000000D0A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DSLB_MISS : Data SLB Miss - Total of all segment sizes ++event:0x0000010016 counters:0 um:zero minimum:10000 name:PM_DSLB_MISS : gate_and(sd_pc_c0_comp_valid AND sd_pc_c0_comp_thread(0:1)=tid,sd_pc_c0_comp_ppc_count(0:3)) + gate_and(sd_pc_c1_comp_valid AND sd_pc_c1_comp_thread(0:1)=tid,sd_pc_c1_comp_ppc_count(0:3)) ++event:0x00000300FC counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG reload ++event:0x000001C058 counters:0 um:zero minimum:10000 name:PM_DTLB_MISS_16G : Data TLB Miss page size 16G ++event:0x000004C056 counters:3 um:zero minimum:10000 name:PM_DTLB_MISS_16M : Data TLB Miss page size 16M ++event:0x000004C05A counters:3 um:zero minimum:10000 name:PM_DTLB_MISS_1G : Data TLB reload (after a miss) page size 1G. Implies radix translation was used ++event:0x000001C05C counters:0 um:zero minimum:10000 name:PM_DTLB_MISS_2M : Data TLB reload (after a miss) page size 2M. 
Implies radix translation was used ++event:0x000002C056 counters:1 um:zero minimum:10000 name:PM_DTLB_MISS_4K : Data TLB Miss page size 4k ++event:0x000003C056 counters:2 um:zero minimum:10000 name:PM_DTLB_MISS_64K : Data TLB Miss page size 64K ++event:0x00000050A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FORCE_MISPRED : XL-form branch was mispredicted due to the predicted target address missing from EAT. The EAT forces a mispredict in this case since there is no predicted target to validate. This is a rare case that may occur when the EAT is full and a branch is issued ++event:0x0000004084 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FULL_CYC : Cycles No room in EAT ++event:0x0000002080 counters:0,1,2,3 um:zero minimum:10000 name:PM_EE_OFF_EXT_INT : Cycles MSR[EE] is off and external interrupts are active ++event:0x00000200F8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt ++event:0x000004505E counters:3 um:zero minimum:10000 name:PM_FLOP_CMPL : Floating Point Operation Finished ++event:0x00000400F8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type) ++event:0x0000030012 counters:2 um:zero minimum:10000 name:PM_FLUSH_COMPLETION : The instruction that was next to complete did not complete because it suffered a flush ++event:0x0000002880 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP : Dispatch flush ++event:0x0000002088 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_SB : Dispatch Flush: Scoreboard ++event:0x0000002888 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_TLBIE : Dispatch Flush: TLBIE ++event:0x0000002084 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_HB_RESTORE_CYC : Cycles in which no new instructions can be dispatched to the ICT after a flush. History buffer recovery ++event:0x00000058A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_LSU : LSU flushes.
Includes all lsu flushes ++event:0x00000050A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_MPRED : Branch mispredict flushes. Includes target and address misprediction ++event:0x0000045054 counters:3 um:zero minimum:10000 name:PM_FMA_CMPL : two flops operation completed (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only. ++event:0x000000509C counters:0,1,2,3 um:zero minimum:10000 name:PM_FORCED_NOP : Instruction was forced to execute as a nop because it was found to behave like a nop (have no effect) at decode time ++event:0x000003000C counters:2 um:zero minimum:10000 name:PM_FREQ_DOWN : Power Management: Below Threshold B ++event:0x000004000C counters:3 um:zero minimum:10000 name:PM_FREQ_UP : Power Management: Above Threshold A ++event:0x000003000E counters:2 um:zero minimum:10000 name:PM_FXU_1PLUS_BUSY : At least one of the 4 FXU units is busy ++event:0x000002000E counters:1 um:zero minimum:10000 name:PM_FXU_BUSY : Cycles in which all 4 FXUs are busy. The FXU is running at capacity ++event:0x0000040004 counters:3 um:zero minimum:10000 name:PM_FXU_FIN : The fixed point unit finished an instruction. Instructions that finish may not necessarily complete.
++event:0x0000024052 counters:1 um:zero minimum:10000 name:PM_FXU_IDLE : Cycles in which FXU0, FXU1, FXU2, and FXU3 are all idle ++event:0x0000020050 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000020052 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED : Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000010052 counters:0 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED_RTY : Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x000002000A counters:1 um:zero minimum:10000 name:PM_HV_CYC : Cycles in which msr_hv is high. Note that this event does not take msr_pr into consideration ++event:0x00000050A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_HWSYNC : Hwsync instruction decoded and transferred ++event:0x0000004884 counters:0,1,2,3 um:zero minimum:10000 name:PM_IBUF_FULL_CYC : Cycles No room in ibuff ++event:0x0000010018 counters:0 um:zero minimum:10000 name:PM_IC_DEMAND_CYC : Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for a demand load ++event:0x0000004098 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BHT_REDIRECT : L2 I cache demand request due to BHT redirect, branch redirect ( 2 bubbles 3 cycles) ++event:0x0000004898 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BR_REDIRECT : L2 I cache demand request due to branch Mispredict ( 15 cycle path) ++event:0x0000004088 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_REQ : Demand Instruction fetch request ++event:0x0000005888 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_INVALIDATE : Ic line invalidated 
++event:0x0000045058 counters:3 um:zero minimum:10000 name:PM_IC_MISS_CMPL : Non-speculative icache miss, counted at completion ++event:0x0000005094 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_MISS_ICBI : threaded version, IC Misses where we got EA dir hit but no sector valids were on. ICBI took line out ++event:0x0000004890 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_HIT : Prefetch Canceled due to icache hit ++event:0x0000004094 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_L2 : L2 Squashed a demand or prefetch request ++event:0x0000004090 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_PAGE : Prefetch Canceled due to page boundary ++event:0x0000004888 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_REQ : Instruction prefetch requests ++event:0x000000488C counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_WRITE : Instruction prefetch written into IL1 ++event:0x0000004894 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_RELOAD_PRIVATE : Reloading line was brought in private for a specific thread. Most lines are brought in shared for all eight threads. If RA does not match then invalidates and then brings it shared to other thread. In P7 line brought in private , then line was invalidat ++event:0x0000020008 counters:1 um:zero minimum:10000 name:PM_ICT_EMPTY_CYC : Cycles in which the ICT is completely empty. 
No itags are assigned to any thread ++event:0x000004D01E counters:3 um:zero minimum:10000 name:PM_ICT_NOSLOT_BR_MPRED : Ict empty for this thread due to branch mispred ++event:0x0000034058 counters:2 um:zero minimum:10000 name:PM_ICT_NOSLOT_BR_MPRED_ICMISS : Ict empty for this thread due to Icache Miss and branch mispred ++event:0x00000100F8 counters:0 um:zero minimum:10000 name:PM_ICT_NOSLOT_CYC : Number of cycles the ICT has no itags assigned to this thread ++event:0x000004E01A counters:3 um:zero minimum:10000 name:PM_ICT_NOSLOT_DISP_HELD : Cycles in which the NTC instruction is held at dispatch for any reason ++event:0x0000030018 counters:2 um:zero minimum:10000 name:PM_ICT_NOSLOT_DISP_HELD_HB_FULL : Ict empty for this thread due to dispatch holds because the History Buffer was full. Could be GPR/VSR/VMR/FPR/CR/XVF ++event:0x000002D01E counters:1 um:zero minimum:10000 name:PM_ICT_NOSLOT_DISP_HELD_ISSQ : Ict empty for this thread due to dispatch hold on this thread due to Issue q full, BRQ full, XVCF Full, Count cache, Link, Tar full ++event:0x000004D01C counters:3 um:zero minimum:10000 name:PM_ICT_NOSLOT_DISP_HELD_SYNC : Dispatch held due to a synchronizing instruction at dispatch ++event:0x0000010064 counters:0 um:zero minimum:10000 name:PM_ICT_NOSLOT_DISP_HELD_TBEGIN : the NTC instruction is being held at dispatch because it is a tbegin instruction and there is an older tbegin in the pipeline that must complete before the younger tbegin can dispatch ++event:0x000003E052 counters:2 um:zero minimum:10000 name:PM_ICT_NOSLOT_IC_L3 : Ict empty for this thread due to icache misses that were sourced from the local L3 ++event:0x000004E010 counters:3 um:zero minimum:10000 name:PM_ICT_NOSLOT_IC_L3MISS : Ict empty for this thread due to icache misses that were sourced from beyond the local L3. 
The source could be local/remote/distant memory or another core's cache ++event:0x000002D01A counters:1 um:zero minimum:10000 name:PM_ICT_NOSLOT_IC_MISS : Ict empty for this thread due to Icache Miss ++event:0x00000100F6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : Number of I-ERAT reloads ++event:0x000004006A counters:3 um:zero minimum:10000 name:PM_IERAT_RELOAD_16M : IERAT Reloaded (Miss) for a 16M page ++event:0x0000020064 counters:1 um:zero minimum:10000 name:PM_IERAT_RELOAD_4K : IERAT reloaded (after a miss) for 4K pages ++event:0x000003006A counters:2 um:zero minimum:10000 name:PM_IERAT_RELOAD_64K : IERAT Reloaded (Miss) for a 64k page ++event:0x000003405E counters:2 um:zero minimum:10000 name:PM_IFETCH_THROTTLE : Cycles in which Instruction fetch throttle was active. ++event:0x0000014050 counters:0 um:zero minimum:10000 name:PM_INST_CHIP_PUMP_CPRED : Initial and Final Pump Scope was chip pump (prediction=correct) for an instruction fetch ++event:0x0000010002 counters:0 um:zero minimum:100000 name:PM_INST_CMPL : Number of PowerPC Instructions that completed. ++event:0x0000020002 counters:1 um:zero minimum:100000 name:PM_INST_CMPL : Number of PowerPC Instructions that completed. ++event:0x0000030002 counters:2 um:zero minimum:100000 name:PM_INST_CMPL : Number of PowerPC Instructions that completed. ++event:0x0000040002 counters:3 um:zero minimum:100000 name:PM_INST_CMPL : Number of PowerPC Instructions that completed. 
++event:0x00000200F2 counters:1 um:zero minimum:10000 name:PM_INST_DISP : # PPC Dispatched ++event:0x00000300F2 counters:2 um:zero minimum:10000 name:PM_INST_DISP : # PPC Dispatched ++event:0x0000044048 counters:3 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction fetch (not prefetch) ++event:0x0000034048 counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction fetch (not prefetch) ++event:0x000003404C counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL4 : The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to an instruction fetch (not prefetch) ++event:0x000004404C counters:3 um:zero minimum:10000 name:PM_INST_FROM_DMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Distant) due to an instruction fetch (not prefetch) ++event:0x0000004080 counters:0,1,2,3 um:zero minimum:10000 name:PM_INST_FROM_L1 : Instruction fetches from L1. 
L1 instruction hit ++event:0x0000014042 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2 : The processor's Instruction cache was reloaded from local core's L2 due to an instruction fetch (not prefetch) ++event:0x0000044046 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L21_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000034046 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L21_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000034040 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_LDHITST : The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to an instruction fetch (not prefetch) ++event:0x0000044040 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_OTHER : The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to an instruction fetch (not prefetch) ++event:0x0000024040 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L2_MEPF : The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. 
due to an instruction fetch (not prefetch) ++event:0x000001404E counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2MISS : The processor's Instruction cache was reloaded from a location other than the local core's L2 due to an instruction fetch (not prefetch) ++event:0x0000014040 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L2 without conflict due to an instruction fetch (not prefetch) ++event:0x0000044042 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3 : The processor's Instruction cache was reloaded from local core's L3 due to an instruction fetch (not prefetch) ++event:0x0000044044 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000034044 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000024044 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L31_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000014046 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L31_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000034042 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3_DISP_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to an instruction fetch (not prefetch) ++event:0x0000024042 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L3_MEPF : The processor's Instruction cache was reloaded from local core's L3 
without dispatch conflicts hit on Mepf state. due to an instruction fetch (not prefetch) ++event:0x00000300FA counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : Marked instruction was reloaded from a location beyond the local chiplet ++event:0x000004404E counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3MISS_MOD : The processor's Instruction cache was reloaded from a location other than the local core's L3 due to a instruction fetch ++event:0x0000014044 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L3_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 without conflict due to an instruction fetch (not prefetch) ++event:0x000001404C counters:0 um:zero minimum:10000 name:PM_INST_FROM_LL4 : The processor's Instruction cache was reloaded from the local chip's L4 cache due to an instruction fetch (not prefetch) ++event:0x0000024048 counters:1 um:zero minimum:10000 name:PM_INST_FROM_LMEM : The processor's Instruction cache was reloaded from the local chip's Memory due to an instruction fetch (not prefetch) ++event:0x000002404C counters:1 um:zero minimum:10000 name:PM_INST_FROM_MEMORY : The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to an instruction fetch (not prefetch) ++event:0x000004404A counters:3 um:zero minimum:10000 name:PM_INST_FROM_OFF_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to an instruction fetch (not prefetch) ++event:0x0000014048 counters:0 um:zero minimum:10000 name:PM_INST_FROM_ON_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to an instruction fetch (not prefetch) ++event:0x0000024046 counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another 
chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction fetch (not prefetch) ++event:0x000001404A counters:0 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction fetch (not prefetch) ++event:0x000002404A counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL4 : The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to an instruction fetch (not prefetch) ++event:0x000003404A counters:2 um:zero minimum:10000 name:PM_INST_FROM_RMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to an instruction fetch (not prefetch) ++event:0x000002C05C counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_CPRED : Initial and Final Pump Scope was group pump (prediction=correct) for an instruction fetch (demand only) ++event:0x000002C05E counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED : Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for an instruction fetch (demand only) ++event:0x0000014052 counters:0 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED_RTY : Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch ++event:0x000004001C counters:3 um:zero minimum:10000 name:PM_INST_IMC_MATCH_CMPL : IMC Match Count ++event:0x0000014054 counters:0 um:zero minimum:10000 name:PM_INST_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for an instruction fetch ++event:0x0000044052 counters:3 um:zero minimum:10000 name:PM_INST_PUMP_MPRED : Pump misprediction. 
Counts across all types of pumps for an instruction fetch ++event:0x0000034050 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_CPRED : Initial and Final Pump Scope was system pump (prediction=correct) for an instruction fetch ++event:0x0000034052 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED : Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for an instruction fetch ++event:0x0000044050 counters:3 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED_RTY : Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for an instruction fetch ++event:0x0000024050 counters:1 um:zero minimum:100000 name:PM_IOPS_CMPL : Internal Operations completed ++event:0x0000045048 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request ++event:0x0000035048 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request ++event:0x000003504C counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request ++event:0x000004504C counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a instruction side request ++event:0x0000015042 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a instruction side request ++event:0x0000045046 counters:3 um:zero 
minimum:10000 name:PM_IPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a instruction side request ++event:0x0000035046 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a instruction side request ++event:0x0000025040 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a instruction side request ++event:0x000001504E counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a instruction side request ++event:0x0000015040 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a instruction side request ++event:0x0000045042 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a instruction side request ++event:0x0000045044 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request ++event:0x0000035044 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a instruction side request ++event:0x0000025044 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a instruction side request ++event:0x0000015046 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_SHR : A Page Table 
Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a instruction side request ++event:0x0000035042 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a instruction side request ++event:0x0000025042 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a instruction side request ++event:0x000004504E counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a instruction side request ++event:0x0000015044 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a instruction side request ++event:0x000001504C counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a instruction side request ++event:0x0000025048 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request ++event:0x000002504C counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a instruction side request ++event:0x000004504A counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a instruction side request ++event:0x0000015048 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data 
from another core's L2/L3 on the same chip due to a instruction side request ++event:0x0000025046 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request ++event:0x000001504A counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request ++event:0x000002504A counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a instruction side request ++event:0x000003504A counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a instruction side request ++event:0x000001688A counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All I-side dispatch attempts for this thread (excludes i_l2mru_tch_reqs) ++event:0x000002608A counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_ADDR : All I-side dispatch attempts for this thread that failed due to a addr collision with another machine (excludes i_l2mru_tch_reqs) ++event:0x000002688A counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All I-side dispatch attempts for this thread that failed due to a reason other than addrs collision (excludes i_l2mru_tch_reqs) ++event:0x0000026890 counters:1 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : Valid when first beat of data comes in for an I-side fetch where data came from memory ++event:0x0000046880 counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : I-side L2 MRU touch sent to L2 for this thread ++event:0x000000D8A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : Instruction 
SLB Miss - Total of all segment sizes ++event:0x0000040006 counters:3 um:zero minimum:10000 name:PM_ISLB_MISS : Number of ISLB misses for this thread ++event:0x000003005A counters:2 um:zero minimum:10000 name:PM_ISQ_0_8_ENTRIES : Cycles in which 8 or less Issue Queue entries are in use. This is a shared event, not per thread ++event:0x000004000A counters:3 um:zero minimum:10000 name:PM_ISQ_36_44_ENTRIES : Cycles in which 36 or more Issue Queue entries are in use. This is a shared event, not per thread. There are 44 issue queue entries across 4 slices in the whole core ++event:0x0000003080 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU0_ISS_HOLD_ALL : All ISU rejects ++event:0x0000003084 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU1_ISS_HOLD_ALL : All ISU rejects ++event:0x0000003880 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU2_ISS_HOLD_ALL : All ISU rejects ++event:0x0000003884 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU3_ISS_HOLD_ALL : All ISU rejects ++event:0x0000002884 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync completion count per thread ++event:0x00000400FC counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded. Counts 1 per ITLB miss for HPT but multiple for radix depending on number of levels traversed ++event:0x000001002C counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand. 
If MMCR1[16] is 1, prefetches will be included as well ++event:0x00000300F6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load ++event:0x000000408C counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors written into IL1 ++event:0x00000200FD counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss ++event:0x0000040012 counters:3 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_ALL : Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch ++event:0x0000030068 counters:2 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_PREF : Counts all Icache prefetch reloads ( includes demand turned into prefetch) ++event:0x0000016890 counters:0 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : Valid when first beat of data comes in for an L1PF where data came from memory ++event:0x0000020054 counters:1 um:zero minimum:10000 name:PM_L1_PREF : A data line was written to the L1 due to a hardware or software prefetch ++event:0x000000E880 counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_SW_PREF : Software L1 Prefetches, including SW Transient Prefetches ++event:0x0000016082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M,Mu,Me) ++event:0x0000016882 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (Tx,Sx) ++event:0x0000046088 counters:3 um:zero minimum:10000 name:PM_L2_CHIP_PUMP : RC requests that were local (aka chip) pump attempts ++event:0x0000026882 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : D-cache invalidates sent over the reload bus to the core ++event:0x0000046080 counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2 miss (excludes i_l2mru_tch_reqs) ++event:0x0000046888 counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on group (aka nodel) pump 
attempts ++event:0x0000026088 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp (GS or NNS) and guess was correct (data intra-group AND ^on-chip) ++event:0x0000026888 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp (GS or NNS) and guess was not correct (ie data on-chip OR beyond-group) ++event:0x0000026082 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : I-cache Invalidates sent over the reload bus to the core ++event:0x0000036080 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++event:0x000003609E counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++event:0x0000036880 counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs) ++event:0x000004609E counters:3 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs) ++event:0x0000016080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread (L2 miss + L2 hits) ++event:0x000001609E counters:0 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful D-side load dispatches for this thread (L2 miss + L2 hits) ++event:0x0000036082 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs) ++event:0x000002609E counters:1 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful D-side load dispatches that were L2 hits for this thread ++event:0x0000036882 counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs) ++event:0x0000026080 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side 
Load dispatches that were an L2 miss for this thread ++event:0x0000016092 counters:0 um:zero minimum:10000 name:PM_L2_LD_MISS_128B : All successful D-side load dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 128B (i.e., M=0) ++event:0x0000026092 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS_64B : All successful D-side load dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 64B(i.e., M=1) ++event:0x0000016088 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess local (LNS) and guess was correct (ie data local) ++event:0x0000016888 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess local (LNS) and guess was not correct (ie data not on chip) ++event:0x0000016084 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : All I-or-D side load dispatch attempts for this thread (excludes i_l2mru_tch_reqs) ++event:0x0000016884 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : All I-or-D side load dispatch attempts for this thread that failed due to address collision with RC/CO/SN/SQ machine (excludes i_l2mru_tch_reqs) ++event:0x0000026084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : All I-or-D side load dispatch attempts for this thread that failed due to reason other than address collision (excludes i_l2mru_tch_reqs) ++event:0x0000036084 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : All D-side store dispatch attempts for this thread ++event:0x0000036884 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : All D-side store dispatch attempts for this thread that failed due to address collision with RC/CO/SN/SQ ++event:0x0000046084 counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : All D-side store dispatch attempts for this thread that failed due to reason other than address collision ++event:0x0000036086 counters:2 um:zero 
minimum:10000 name:PM_L2_RC_ST_DONE : RC did store to line that was Tx or Sx ++event:0x000003688A counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core (excludes DCBFs) ++event:0x000003689E counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core (excludes DCBFs) ++event:0x000003608A counters:2 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core (excludes DCBFs) ++event:0x000004689E counters:3 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core (excludes DCBFs) ++event:0x0000046086 counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M (true M) ++event:0x0000016086 counters:0 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M (true M) ++event:0x0000046886 counters:3 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M (true M) ++event:0x0000036886 counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx to Ix ++event:0x0000016880 counters:0 um:zero minimum:10000 name:PM_L2_ST : All successful D-side store dispatches for this thread (L2 miss + L2 hits) ++event:0x000001689E counters:0 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful D-side store dispatches for this thread (L2 miss + L2 hits) ++event:0x0000046082 counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful D-side store dispatches for this thread ++event:0x000002689E counters:1 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful D-side store dispatches that were L2 hits for this thread ++event:0x0000046882 counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful D-side store dispatches for this thread that were L2 hits ++event:0x0000026880 counters:1 um:zero minimum:10000 name:PM_L2_ST_MISS : All successful D-Side Store dispatches that were an L2 miss for this thread ++event:0x0000016892 counters:0 
um:zero minimum:10000 name:PM_L2_ST_MISS_128B : All successful D-side store dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 128B (i.e., M=0) ++event:0x0000026892 counters:1 um:zero minimum:10000 name:PM_L2_ST_MISS_64B : All successful D-side store dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 64B (i.e., M=1) ++event:0x0000036088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess system (VGS or RNS) and guess was correct (ie data beyond-group) ++event:0x0000036888 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess system (VGS or RNS) and guess was not correct (ie data ^beyond-group) ++event:0x000004688A counters:3 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts ++event:0x00000260A2 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count) ++event:0x00000268A2 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count) ++event:0x00000368A4 counters:2 um:zero minimum:10000 name:PM_L3_CINJ : L3 castin of cache inject ++event:0x00000168AC counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : Rotating sample of 16 CI or CO actives ++event:0x00000360A8 counters:2 um:zero minimum:10000 name:PM_L3_CO : L3 castout occurring (does not include casthrough or log writes (cinj/dmaw)) ++event:0x00000368AC counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : Lifetime, sample of CO machine 0 valid ++event:0x00000468AC counters:3 um:zero minimum:10000 name:PM_L3_CO0_BUSY : Lifetime, sample of CO machine 0 valid ++event:0x00000268A0 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 (lossy = may undercount if two cresps come in the same cyc) ++event:0x00000360A4 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 COs occurred on LCO L3.1 (good cresp, may end up in mem on a retry) 
++event:0x00000260A0 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 (lossy = may undercount if two cresp come in the same cyc) ++event:0x00000168A0 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state (includes casthrough to memory). The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request ++event:0x000003E05E counters:2 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 castouts in Mepf state for this thread ++event:0x00000168B2 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group (GS or NNS) and data from same group (near) (pred successful) ++event:0x00000368B2 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group (GS or NNS) but data from local node. Prediction too high ++event:0x00000360B2 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group (GS or NNS) but data from outside group (far or rem). Prediction too Low ++event:0x00000160A4 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits (L2 miss hitting L3, including data/instrn/xlate) ++event:0x00000360A2 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 CO hits ++event:0x00000368A2 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 CO miss ++event:0x00000460A2 counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit ++event:0x00000468A2 counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss ++event:0x00000260A4 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 Hits for demand LDs ++event:0x00000268A4 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 Misses for demand LDs ++event:0x000000F0B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_L3_LD_PREF : L3 load prefetch, sourced from a hardware or software stream, was sent to the nest ++event:0x00000160B2 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip 
(LNS) and data from local node (local) (pred successful) - always PFs only ++event:0x00000268B2 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node (LNS) but data from out side local node (near or far or rem). Prediction too Low ++event:0x00000168A4 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses (L2 miss also missing L3, including data/instrn/xlate) ++event:0x00000460AA counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : L3 CO to L3.1 (LCO) port 0 with or without data ++event:0x00000360AA counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : L3 CO to memory port 0 with or without data ++event:0x00000360AE counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 (memory only), every retry counted ++event:0x00000460AE counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 2 (memory only), every retry counted ++event:0x00000260B0 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 PF sent with grp scope port 0, counts even retried requests ++event:0x00000260AA counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : LCO sent with data port 0 ++event:0x00000160AA counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : Dataless L3 LCO sent port 0 ++event:0x00000160B4 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 initiated LCO received retry on port 0 (can try 4 times) ++event:0x00000160B0 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 PF sent with nodal scope port 0, counts even retried requests ++event:0x00000160AE counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0, every retry counted ++event:0x00000260AE counters:1 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 2, every retry counted ++event:0x00000360B0 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 PF sent with sys scope port 0, counts even retried requests ++event:0x00000468AA counters:3 um:zero 
minimum:10000 name:PM_L3_P1_CO_L31 : L3 CO to L3.1 (LCO) port 1 with or without data ++event:0x00000368AA counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : L3 CO to memory port 1 with or without data ++event:0x00000368AE counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 (memory only), every retry counted ++event:0x00000468AE counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 3 (memory only), every retry counted ++event:0x00000268B0 counters:1 um:zero minimum:10000 name:PM_L3_P1_GRP_PUMP : L3 PF sent with grp scope port 1, counts even retried requests ++event:0x00000268AA counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : LCO sent with data port 1 ++event:0x00000168AA counters:0 um:zero minimum:10000 name:PM_L3_P1_LCO_NO_DATA : Dataless L3 LCO sent port 1 ++event:0x00000168B4 counters:0 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 initiated LCO received retry on port 1 (can try 4 times) ++event:0x00000168B0 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 PF sent with nodal scope port 1, counts even retried requests ++event:0x00000168AE counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1, every retry counted ++event:0x00000268AE counters:1 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 3, every retry counted ++event:0x00000368B0 counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 PF sent with sys scope port 1, counts even retried requests ++event:0x00000260B4 counters:1 um:zero minimum:10000 name:PM_L3_P2_LCO_RTY : L3 initiated LCO received retry on port 2 (can try 4 times) ++event:0x00000268B4 counters:1 um:zero minimum:10000 name:PM_L3_P3_LCO_RTY : L3 initiated LCO received retry on port 3 (can try 4 times) ++event:0x00000360B4 counters:2 um:zero minimum:10000 name:PM_L3_PF0_BUSY : Lifetime, sample of PF machine 0 valid ++event:0x00000460B4 counters:3 um:zero minimum:10000 name:PM_L3_PF0_BUSY : Lifetime, 
sample of PF machine 0 valid ++event:0x00000260A8 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : L3 PF hit in L3 (abandoned) ++event:0x00000160A0 counters:0 um:zero minimum:10000 name:PM_L3_PF_MISS_L3 : L3 PF missed in L3 ++event:0x00000368A0 counters:2 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_CACHE : L3 PF from Off chip cache ++event:0x00000468A0 counters:3 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_MEM : L3 PF from Off chip memory ++event:0x00000360A0 counters:2 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_CACHE : L3 PF from On chip cache ++event:0x00000460A0 counters:3 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_MEM : L3 PF from On chip memory ++event:0x00000260AC counters:1 um:zero minimum:10000 name:PM_L3_PF_USAGE : Rotating sample of 32 PF actives ++event:0x00000368B4 counters:2 um:zero minimum:10000 name:PM_L3_RD0_BUSY : Lifetime, sample of RD machine 0 valid ++event:0x00000468B4 counters:3 um:zero minimum:10000 name:PM_L3_RD0_BUSY : Lifetime, sample of RD machine 0 valid ++event:0x00000268AC counters:1 um:zero minimum:10000 name:PM_L3_RD_USAGE : Rotating sample of 16 RD actives ++event:0x00000360AC counters:2 um:zero minimum:10000 name:PM_L3_SN0_BUSY : Lifetime, sample of snooper machine 0 valid ++event:0x00000460AC counters:3 um:zero minimum:10000 name:PM_L3_SN0_BUSY : Lifetime, sample of snooper machine 0 valid ++event:0x00000160AC counters:0 um:zero minimum:10000 name:PM_L3_SN_USAGE : Rotating sample of 16 snoop valids ++event:0x000000F8B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_L3_SW_PREF : L3 load prefetch, sourced from a software prefetch stream, was sent to the nest ++event:0x00000260B2 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system (VGS or RNS) and data from outside group (far or rem)(pred successful) ++event:0x00000460B2 counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system (VGS or RNS) but data from local or near. 
Prediction too high ++event:0x00000468A4 counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch received from L2 ++event:0x00000160B6 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : Rotating sample of 8 WI valid ++event:0x00000260B6 counters:1 um:zero minimum:10000 name:PM_L3_WI0_BUSY : Rotating sample of 8 WI valid (duplicate) ++event:0x00000168A8 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : Lifetime, sample of Write Inject machine 0 valid ++event:0x000003C058 counters:2 um:zero minimum:10000 name:PM_LARX_FIN : Larx finished ++event:0x000004003E counters:3 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed ++event:0x0000010062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread ++event:0x000003E054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load. ++event:0x00000400F0 counters:3 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load. ++event:0x000002C04E counters:1 um:zero minimum:10000 name:PM_LD_MISS_L1_FIN : Number of load instructions that finished with an L1 miss. Note that even if a load spans multiple slices this event will increment only once per load op. 
++event:0x00000100FC counters:0 um:zero minimum:10000 name:PM_LD_REF_L1 : All L1 D cache load references counted at finish, gated by reject ++event:0x00000058A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_CORRECT : Link stack predicts right address ++event:0x0000005898 counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_INVALID_PTR : It is most often caused by certain types of flush where the pointer is not available. Can result in the data in the link stack becoming unusable. ++event:0x0000005098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_WRONG_ADD_PRED : Link stack predicts wrong address, because of link stack design limitation or software violating the coding conventions ++event:0x000002E05E counters:1 um:zero minimum:10000 name:PM_LMQ_EMPTY_CYC : Cycles in which the LMQ has no pending load misses for this thread ++event:0x000001002E counters:0 um:zero minimum:10000 name:PM_LMQ_MERGE : A demand miss collides with a prefetch for the same line ++event:0x000002E05A counters:1 um:zero minimum:10000 name:PM_LRQ_REJECT : Internal LSU reject from LRQ. Rejects cause the load to go back to LRQ, but it stays contained within the LSU once it gets issued. This event counts the number of times the LRQ attempts to relaunch an instruction after a reject. 
Any load can suffer multiple rejects ++event:0x000000D090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_DC_COLLISIONS : Read-write data cache collisions ++event:0x000000E084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_ERAT_MISS_PREF : LS0 Erat miss due to prefetch ++event:0x000000C09C counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_LAUNCH_HELD_PREF : Number of times a load or store instruction was unable to launch/relaunch because a high priority prefetch used that relaunch cycle ++event:0x000000E0BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_PTE_TABLEWALK_CYC : Cycles when a tablewalk is pending on this thread on table 0 ++event:0x000000E0B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_TM_DISALLOW : A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it ++event:0x000000C094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_UNALIGNED_LD : Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size. If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000F0B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_UNALIGNED_ST : Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size. 
If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000D890 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_DC_COLLISIONS : Read-write data cache collisions ++event:0x000000E884 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_ERAT_MISS_PREF : LS1 Erat miss due to prefetch ++event:0x000000C89C counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_LAUNCH_HELD_PREF : Number of times a load or store instruction was unable to launch/relaunch because a high priority prefetch used that relaunch cycle ++event:0x000000E8BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_PTE_TABLEWALK_CYC : Cycles when a tablewalk is pending on this thread on table 1 ++event:0x000000E8B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_TM_DISALLOW : A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it ++event:0x000000C894 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_UNALIGNED_LD : Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size. If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000F8B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_UNALIGNED_ST : Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size. 
If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000D094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS2_DC_COLLISIONS : Read-write data cache collisions ++event:0x000000E088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS2_ERAT_MISS_PREF : LS0 Erat miss due to prefetch ++event:0x000000E0B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS2_TM_DISALLOW : A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it ++event:0x000000C098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS2_UNALIGNED_LD : Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size. If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000F0BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LS2_UNALIGNED_ST : Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size. If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000D894 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS3_DC_COLLISIONS : Read-write data cache collisions ++event:0x000000E888 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS3_ERAT_MISS_PREF : LS1 Erat miss due to prefetch ++event:0x000000E8B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS3_TM_DISALLOW : A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it ++event:0x000000C898 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS3_UNALIGNED_LD : Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size. 
If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000F8BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LS3_UNALIGNED_ST : Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size. If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty ++event:0x000000D0BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_1_LRQF_FULL_CYC : Counts the number of cycles the LRQF is full. LRQF is the queue that holds loads between finish and completion. If it fills up, instructions stay in LRQ until completion, potentially backing up the LRQ ++event:0x000000E08C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_ERAT_HIT : Primary ERAT hit. There is no secondary ERAT ++event:0x000000C0A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FALSE_LHS : False LHS match detected ++event:0x000000F090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_L1_CAM_CANCEL : ls0 l1 tm cam cancel ++event:0x000000D088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_LDMX_FIN : New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491): "The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region." This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56]). ++event:0x000000D8B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_LMQ_S0_VALID : Slot 0 of LMQ valid ++event:0x000000D8B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_LRQ_S0_VALID_CYC : Slot 0 of LRQ valid ++event:0x000000D080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_SET_MPRED : Set prediction(set-p) miss. 
The entry was not found in the Set prediction table ++event:0x000000D0B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_SRQ_S0_VALID_CYC : Slot 0 of SRQ valid ++event:0x000000F088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_STORE_REJECT : All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met ++event:0x000000E094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_HIT : Load tm hit in L1 ++event:0x000000E09C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_MISS : Load tm L1 miss ++event:0x000000E88C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_ERAT_HIT : Primary ERAT hit. There is no secondary ERAT ++event:0x000000C8A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FALSE_LHS : False LHS match detected ++event:0x000000F890 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_L1_CAM_CANCEL : ls1 l1 tm cam cancel ++event:0x000000D888 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_LDMX_FIN : New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491): "The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region." This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56]). ++event:0x000000D880 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_SET_MPRED : Set prediction(set-p) miss. 
The entry was not found in the Set prediction table ++event:0x000000F888 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_STORE_REJECT : All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met ++event:0x000000E894 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_HIT : Load tm hit in L1 ++event:0x000000E89C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_MISS : Load tm L1 miss ++event:0x000000D8BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_3_LRQF_FULL_CYC : Counts the number of cycles the LRQF is full. LRQF is the queue that holds loads between finish and completion. If it fills up, instructions stay in LRQ until completion, potentially backing up the LRQ ++event:0x000000E090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_ERAT_HIT : Primary ERAT hit. There is no secondary ERAT ++event:0x000000C0A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FALSE_LHS : False LHS match detected ++event:0x000000F094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_L1_CAM_CANCEL : ls2 l1 tm cam cancel ++event:0x000000D08C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LDMX_FIN : New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491): "The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region." This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56]). ++event:0x000000D084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_SET_MPRED : Set prediction(set-p) miss. 
The entry was not found in the Set prediction table ++event:0x000000F08C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_STORE_REJECT : All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met ++event:0x000000E098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_HIT : Load tm hit in L1 ++event:0x000000E0A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_MISS : Load tm L1 miss ++event:0x000000E890 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_ERAT_HIT : Primary ERAT hit. There is no secondary ERAT ++event:0x000000C8A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FALSE_LHS : False LHS match detected ++event:0x000000F894 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_L1_CAM_CANCEL : ls3 l1 tm cam cancel ++event:0x000000D88C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LDMX_FIN : New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491): "The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region." This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56]). ++event:0x000000D884 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_SET_MPRED : Set prediction(set-p) miss. 
The entry was not found in the Set prediction table ++event:0x000000F88C counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_STORE_REJECT : All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met ++event:0x000000E898 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_HIT : Load tm hit in L1 ++event:0x000000E8A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_MISS : Load tm L1 miss ++event:0x00000200F6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded due to a DERAT miss ++event:0x0000030066 counters:2 um:zero minimum:10000 name:PM_LSU_FIN : LSU Finished a PPC instruction (up to 4 per cycle) ++event:0x000000C8A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_ATOMIC : Quad-word loads (lq) are considered atomic because they always span at least 2 slices. If a snoop or store from another thread changes the data the load is accessing between the 2 or 3 pieces of the lq instruction, the lq will be flushed ++event:0x000000C0A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_CI : Load was not issued to LSU as a cache inhibited (non-cacheable) load but it was later determined to be cache inhibited ++event:0x000000C0AC counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_EMSH : An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address ++event:0x000000C8B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_LARX_STCX : A larx is flushed because an older larx has an LMQ reservation for the same thread. A stcx is flushed because an older stcx is in the LMQ. The flush happens when the older larx/stcx relaunches ++event:0x000000C8B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_LHL_SHL : The instruction was flushed because of a sequential load/store consistency. 
If a load or store hits on an older load that has either been snooped (for loads) or has stale data (for stores). ++event:0x000000C8B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_LHS : Effective Address alias flush : no EA match but Real Address match. If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed ++event:0x00000020B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_NEXT : LSU flush next reported at flush time. Sometimes these also come with an exception ++event:0x000000C0BC counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_OTHER : Other LSU flushes including: Sync (sync ack from L2 caused search of LRQ for oldest snooped load, This will either signal a Precise Flush of the oldest snooped load or a Flush Next PPC) ++event:0x000000C8AC counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_RELAUNCH_MISS : If a load that has already returned data and has to relaunch for any reason then gets a miss (erat, setp, data cache), it will often be flushed at relaunch time because the data might be inconsistent ++event:0x000000C0B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_SAO : A load-hit-load condition with Strong Address Ordering will have address compare disabled and flush ++event:0x000000C0B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_UE : Correctable ECC error on reload data, reported at critical data forward time ++event:0x000000C0B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_WRK_ARND : LSU workaround flush. These flushes are setup with programmable scan only latches to perform various actions when the flush macro receives a trigger from the dbg macros. These actions include things like flushing the next op encountered for a particular thread or flushing the next op that is NTC op that is encountered on a particular slice. The kind of flush that the workaround is setup to perform is highly variable. 
++event:0x000000D0B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_FULL_CYC : Counts the number of cycles the LMQ is full ++event:0x000002003E counters:1 um:zero minimum:10000 name:PM_LSU_LMQ_SRQ_EMPTY_CYC : Cycles in which the LSU is empty for all threads (lmq and srq are completely empty) ++event:0x000000C890 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_NCST : Asserts when a i=1 store op is sent to the nest. No record of issue pipe (LS0/LS1) is maintained so this is for both pipes. Probably don't need separate LS0 and LS1 ++event:0x000002E05C counters:1 um:zero minimum:10000 name:PM_LSU_REJECT_ERAT_MISS : LSU Reject due to ERAT (up to 4 per cycle) ++event:0x000004E05C counters:3 um:zero minimum:10000 name:PM_LSU_REJECT_LHS : LSU Reject due to LHS (up to 4 per cycle) ++event:0x000003001C counters:2 um:zero minimum:10000 name:PM_LSU_REJECT_LMQ_FULL : LSU Reject due to LMQ full (up to 4 per cycle) ++event:0x000001001A counters:0 um:zero minimum:10000 name:PM_LSU_SRQ_FULL_CYC : Cycles in which the Store Queue is full on all 4 slices. This event is not per thread. All the threads will see the same count for this core resource ++event:0x000000C090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_STCX : STCX sent to nest, i.e. 
total ++event:0x000000F080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_STCX_FAIL : LSU_STCX_FAIL ++event:0x0000005894 counters:0,1,2,3 um:zero minimum:10000 name:PM_LWSYNC : Lwsync instruction decoded and transferred ++event:0x000004505C counters:3 um:zero minimum:10000 name:PM_MATH_FLOP_CMPL : Math flop instruction completed ++event:0x000004C058 counters:3 um:zero minimum:10000 name:PM_MEM_CO : Memory castouts from this thread ++event:0x0000010058 counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_IFU : Local Memory above threshold for IFU speculation control ++event:0x0000040056 counters:3 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_HIGH : Local memory above threshold for LSU medium ++event:0x000001C05E counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_MED : Local memory above threshold for data prefetch ++event:0x000002C058 counters:1 um:zero minimum:10000 name:PM_MEM_PREF : Memory prefetch for this thread. Includes L4 ++event:0x0000010056 counters:0 um:zero minimum:10000 name:PM_MEM_READ : Reads from Memory from this thread (includes data/inst/xlate/l1prefetch/inst prefetch). 
Includes L4 ++event:0x000003C05E counters:2 um:zero minimum:10000 name:PM_MEM_RWITM : Memory Read With Intent to Modify for this thread ++event:0x000003515E counters:2 um:zero minimum:100 name:PM_MRK_BACK_BR_CMPL : Marked branch instruction completed with a target address less than current instruction address ++event:0x0000010138 counters:0 um:zero minimum:100 name:PM_MRK_BR_2PATH : marked branches which are not strongly biased ++event:0x000001016E counters:0 um:zero minimum:100 name:PM_MRK_BR_CMPL : Branch Instruction completed ++event:0x00000301E4 counters:2 um:zero minimum:100 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted ++event:0x00000101E2 counters:0 um:zero minimum:100 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed ++event:0x000002013A counters:1 um:zero minimum:100 name:PM_MRK_BRU_FIN : bru marked instr finish ++event:0x000003D14E counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load ++event:0x000004D12E counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load ++event:0x000001D150 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load ++event:0x000002C128 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load ++event:0x000001D152 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different 
Node or Group (Distant) due to a marked load ++event:0x000002C12C counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL4_CYC : Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load ++event:0x000003D14C counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load ++event:0x000004E11E counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_DMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load ++event:0x000002C126 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a marked load ++event:0x000004D146 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load ++event:0x000003D148 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load ++event:0x000002D14E counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load ++event:0x000001D154 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load ++event:0x0000014156 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load ++event:0x000002D148 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load 
++event:0x000001415A counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC : Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load ++event:0x000002C124 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load ++event:0x000003D140 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC : Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load ++event:0x000004C120 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load ++event:0x000003D144 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_MEPF_CYC : Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state. 
due to a marked load ++event:0x00000401E8 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2MISS : The processor's data cache was reloaded from a location other than the local core's L2 due to a marked load ++event:0x0000035152 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a location other than the local core's L2 due to a marked load ++event:0x000002C120 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a marked load ++event:0x0000014158 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load ++event:0x000004D142 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a marked load ++event:0x000004D144 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load ++event:0x0000035158 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load ++event:0x000002D14C counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load ++event:0x000001D142 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load ++event:0x000002D144 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load 
++event:0x000001D140 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load ++event:0x000004D124 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load ++event:0x0000035156 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load ++event:0x0000035154 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load ++event:0x000001D144 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load ++event:0x000002C122 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC : Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load ++event:0x000002D142 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. 
due to a marked load ++event:0x000001415C counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_MEPF_CYC : Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state due to a marked load ++event:0x00000201E4 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a location other than the local core's L3 due to a marked load ++event:0x000001415E counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a location other than the local core's L3 due to a marked load ++event:0x000003D146 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a marked load ++event:0x000004C124 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load ++event:0x000001D14C counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load ++event:0x000002C12E counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load ++event:0x000003D142 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load ++event:0x000004D128 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load ++event:0x00000201E0 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load ++event:0x000001D146 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location 
including L4 from local remote or distant due to a marked load ++event:0x000002D120 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load ++event:0x000001D14E counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load ++event:0x000004D140 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a marked load ++event:0x000003515A counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load ++event:0x000001D14A counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load ++event:0x000002D14A counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load ++event:0x0000035150 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load ++event:0x000004C12A counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load ++event:0x000003515C 
counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load ++event:0x000004D12A counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL4_CYC : Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load ++event:0x000001D148 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a marked load ++event:0x000002C12A counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_RMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group ( Remote) due to a marked load ++event:0x0000040118 counters:3 um:zero minimum:100 name:PM_MRK_DCACHE_RELOAD_INTV : Combined Intervention event ++event:0x00000301E6 counters:2 um:zero minimum:100 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes ++event:0x000004C15C counters:3 um:zero minimum:100 name:PM_MRK_DERAT_MISS_16G : Marked Data ERAT Miss (Data TLB Access) page size 16G ++event:0x000003D154 counters:2 um:zero minimum:100 name:PM_MRK_DERAT_MISS_16M : Marked Data ERAT Miss (Data TLB Access) page size 16M ++event:0x000003D152 counters:2 um:zero minimum:100 name:PM_MRK_DERAT_MISS_1G : Marked Data ERAT Miss (Data TLB Access) page size 1G. Implies radix translation ++event:0x000002D152 counters:1 um:zero minimum:100 name:PM_MRK_DERAT_MISS_2M : Marked Data ERAT Miss (Data TLB Access) page size 2M. 
Implies radix translation ++event:0x000002D150 counters:1 um:zero minimum:100 name:PM_MRK_DERAT_MISS_4K : Marked Data ERAT Miss (Data TLB Access) page size 4K ++event:0x000002D154 counters:1 um:zero minimum:100 name:PM_MRK_DERAT_MISS_64K : Marked Data ERAT Miss (Data TLB Access) page size 64K ++event:0x0000020132 counters:1 um:zero minimum:100 name:PM_MRK_DFU_FIN : Decimal Unit marked Instruction Finish ++event:0x000004F148 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F148 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F14C counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004F14C counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F142 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. 
When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004F146 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F146 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F140 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F14E counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F140 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004F142 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000004F144 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F144 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F144 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F146 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F142 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F142 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000004F14E counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F144 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F14C counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F148 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F14C counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000004F14A counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x000001F148 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F146 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000001F14A counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000002F14A counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included ++event:0x000003F14A counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. 
Only PTE reloads are included ++event:0x00000401E4 counters:3 um:zero minimum:100 name:PM_MRK_DTLB_MISS : Marked dtlb miss ++event:0x000002D15E counters:1 um:zero minimum:100 name:PM_MRK_DTLB_MISS_16G : Marked Data TLB Miss page size 16G ++event:0x000004C15E counters:3 um:zero minimum:100 name:PM_MRK_DTLB_MISS_16M : Marked Data TLB Miss page size 16M ++event:0x000001D15C counters:0 um:zero minimum:100 name:PM_MRK_DTLB_MISS_1G : Marked Data TLB reload (after a miss) page size 1G. Implies radix translation was used ++event:0x000002D156 counters:1 um:zero minimum:100 name:PM_MRK_DTLB_MISS_4K : Marked Data TLB Miss page size 4k ++event:0x000003D156 counters:2 um:zero minimum:100 name:PM_MRK_DTLB_MISS_64K : Marked Data TLB Miss page size 64K ++event:0x0000040154 counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_BKILL : Marked store had to do a bkill ++event:0x000001F152 counters:0 um:zero minimum:100 name:PM_MRK_FAB_RSP_BKILL_CYC : cycles L2 RC took for a bkill ++event:0x000003015E counters:2 um:zero minimum:100 name:PM_MRK_FAB_RSP_CLAIM_RTY : Sampled store did a rwitm and got a rty ++event:0x0000030154 counters:2 um:zero minimum:100 name:PM_MRK_FAB_RSP_DCLAIM : Marked store had to do a dclaim ++event:0x000002F152 counters:1 um:zero minimum:100 name:PM_MRK_FAB_RSP_DCLAIM_CYC : cycles L2 RC took for a dclaim ++event:0x000004015E counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_RD_RTY : Sampled L2 reads retry count ++event:0x000001015E counters:0 um:zero minimum:100 name:PM_MRK_FAB_RSP_RD_T_INTV : Sampled Read got a T intervention ++event:0x000004F150 counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_RWITM_CYC : cycles L2 RC took for a rwitm ++event:0x000002015E counters:1 um:zero minimum:100 name:PM_MRK_FAB_RSP_RWITM_RTY : Sampled store did a rwitm and got a rty ++event:0x0000020134 counters:1 um:zero minimum:100 name:PM_MRK_FXU_FIN : fxu marked instr finish ++event:0x000004013A counters:3 um:zero minimum:100 name:PM_MRK_IC_MISS : Marked instruction experienced I
cache miss ++event:0x0000024058 counters:1 um:zero minimum:100 name:PM_MRK_INST : An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens ++event:0x00000401E0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : marked instruction completed ++event:0x0000020130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : An instruction was marked at decode time. Random Instruction Sampling (RIS) only ++event:0x00000101E0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction ++event:0x0000030130 counters:2 um:zero minimum:1000 name:PM_MRK_INST_FIN : marked instruction finished ++event:0x00000401E6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : Marked instruction was reloaded from a location beyond the local chiplet ++event:0x0000010132 counters:0 um:zero minimum:1000 name:PM_MRK_INST_ISSUED : Marked instruction issued ++event:0x0000040134 counters:3 um:zero minimum:1000 name:PM_MRK_INST_TIMEO : marked Instruction finish timeout (instruction lost) ++event:0x00000101E4 counters:0 um:zero minimum:100 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss ++event:0x00000101EA counters:0 um:zero minimum:100 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload ++event:0x0000020114 counters:1 um:zero minimum:100 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2 ++event:0x000003012A counters:2 um:zero minimum:100 name:PM_MRK_L2_RC_DONE : Marked RC done ++event:0x000001E15E counters:0 um:zero minimum:100 name:PM_MRK_L2_TM_REQ_ABORT : TM abort ++event:0x000003E15C counters:2 um:zero minimum:100 name:PM_MRK_L2_TM_ST_ABORT_SISTER : TM marked store abort for this thread ++event:0x0000040116 counters:3 um:zero minimum:100 name:PM_MRK_LARX_FIN : Larx finished ++event:0x000001013E counters:0 um:zero minimum:100 name:PM_MRK_LD_MISS_EXPOSED_CYC : Marked Load exposed 
Miss (use edge detect to count #) ++event:0x00000201E2 counters:1 um:zero minimum:100 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load. ++event:0x000001D056 counters:0 um:zero minimum:100 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency ++event:0x0000030162 counters:2 um:zero minimum:100 name:PM_MRK_LSU_DERAT_MISS : Marked derat reload (miss) for any page size ++event:0x0000040132 counters:3 um:zero minimum:100 name:PM_MRK_LSU_FIN : lsu marked instr PPC finish ++event:0x000000D098 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_ATOMIC : Quad-word loads (lq) are considered atomic because they always span at least 2 slices. If a snoop or store from another thread changes the data the load is accessing between the 2 or 3 pieces of the lq instruction, the lq will be flushed ++event:0x000000D898 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_EMSH : An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address ++event:0x000000D8A4 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_LARX_STCX : A larx is flushed because an older larx has an LMQ reservation for the same thread. A stcx is flushed because an older stcx is in the LMQ. The flush happens when the older larx/stcx relaunches ++event:0x000000D8A0 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_LHL_SHL : The instruction was flushed because of a sequential load/store consistency. If a load or store hits on an older load that has either been snooped (for loads) or has stale data (for stores). ++event:0x000000D0A0 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_LHS : Effective Address alias flush : no EA match but Real Address match. 
If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed ++event:0x000000D09C counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_RELAUNCH_MISS : If a load that has already returned data and has to relaunch for any reason then gets a miss (erat, setp, data cache), it will often be flushed at relaunch time because the data might be inconsistent ++event:0x000000D0A4 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_SAO : A load-hit-load condition with Strong Address Ordering will have address compare disabled and flush ++event:0x000000D89C counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_UE : Correctable ECC error on reload data, reported at critical data forward time ++event:0x000002011C counters:1 um:zero minimum:100 name:PM_MRK_NTC_CYC : Cycles during which the marked instruction is next to complete (completion is held up because the marked instruction hasn't completed yet) ++event:0x0000020112 counters:1 um:zero minimum:100 name:PM_MRK_NTF_FIN : Marked next to finish instruction finished ++event:0x000001F05E counters:0 um:zero minimum:100 name:PM_MRK_PROBE_NOP_CMPL : Marked probeNops completed ++event:0x000001D15E counters:0 um:zero minimum:1000 name:PM_MRK_RUN_CYC : Run cycles in which a marked instruction is in the pipeline ++event:0x000003013E counters:2 um:zero minimum:100 name:PM_MRK_STALL_CMPLU_CYC : Number of cycles the marked instruction is experiencing a stall while it is next to complete (NTC) ++event:0x00000301E2 counters:2 um:zero minimum:100 name:PM_MRK_ST_CMPL : Marked store completed and sent to nest ++event:0x0000030134 counters:2 um:zero minimum:100 name:PM_MRK_ST_CMPL_INT : marked store finished with intervention ++event:0x000003E158 counters:2 um:zero minimum:100 name:PM_MRK_STCX_FAIL : marked stcx failed ++event:0x0000024056 counters:1 um:zero minimum:100 name:PM_MRK_STCX_FIN : Number of marked stcx instructions finished. 
This includes instructions in the speculative path of a branch that may be flushed ++event:0x0000010134 counters:0 um:zero minimum:100 name:PM_MRK_ST_DONE_L2 : marked store completed in L2 ( RC machine done) ++event:0x000003F150 counters:2 um:zero minimum:100 name:PM_MRK_ST_DRAIN_TO_L2DISP_CYC : cycles to drain st from core to L2 ++event:0x000003012C counters:2 um:zero minimum:100 name:PM_MRK_ST_FWD : Marked st forwards ++event:0x000001F150 counters:0 um:zero minimum:100 name:PM_MRK_ST_L2DISP_TO_CMPL_CYC : cycles from L2 rc disp to l2 rc completion ++event:0x0000020138 counters:1 um:zero minimum:100 name:PM_MRK_ST_NEST : Marked store sent to nest ++event:0x00000028A4 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_TEND_FAIL : Nested or not nested tend failed for a marked tend instruction ++event:0x0000030132 counters:2 um:zero minimum:100 name:PM_MRK_VSU_FIN : VSU marked instr finish ++event:0x000003D15E counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked instr ++event:0x000003006E counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Multiply by 4 to obtain the number of PB cycles ++event:0x000000F8A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NON_DATA_STORE : All ops that drain from s2q to L2 and contain no data ++event:0x000004D056 counters:3 um:zero minimum:10000 name:PM_NON_FMA_FLOP_CMPL : Non FMA instruction completed ++event:0x000004D05A counters:3 um:zero minimum:10000 name:PM_NON_MATH_FLOP_CMPL : Non FLOP operation completed ++event:0x00000260A6 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : Non-TM snp rst TM SC ++event:0x000002001A counters:1 um:zero minimum:10000 name:PM_NTC_ALL_FIN : Cycles after all instructions have finished to group completed ++event:0x000002405A counters:1 um:zero minimum:10000 name:PM_NTC_FIN : Cycles in which the oldest instruction in the pipeline (NTC) finishes. 
This event is used to account for cycles in which work is being completed in the CPI stack ++event:0x000002E016 counters:1 um:zero minimum:10000 name:PM_NTC_ISSUE_HELD_ARB : The NTC instruction is being held at dispatch because it lost arbitration onto the issue pipe to another instruction (from the same thread or a different thread) ++event:0x000001006A counters:0 um:zero minimum:10000 name:PM_NTC_ISSUE_HELD_DARQ_FULL : The NTC instruction is being held at dispatch because there are no slots in the DARQ for it ++event:0x000003D05A counters:2 um:zero minimum:10000 name:PM_NTC_ISSUE_HELD_OTHER : The NTC instruction is being held at dispatch during regular pipeline cycles, or because the VSU is busy with multi-cycle instructions, or because of a write-back collision with VSU ++event:0x0000034054 counters:2 um:zero minimum:10000 name:PM_PARTIAL_ST_FIN : Any store finished by an LSU slice ++event:0x0000020010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1 ++event:0x000004D02C counters:3 um:zero minimum:10000 name:PM_PMC1_REWIND : PMC1_REWIND ++event:0x000004D010 counters:3 um:zero minimum:10000 name:PM_PMC1_SAVED : PMC1 Rewind Value saved ++event:0x0000030010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2 ++event:0x0000030020 counters:2 um:zero minimum:10000 name:PM_PMC2_REWIND : PMC2 Rewind Event (did not match condition) ++event:0x0000010022 counters:0 um:zero minimum:10000 name:PM_PMC2_SAVED : PMC2 Rewind Value saved ++event:0x0000040010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3 ++event:0x000001000A counters:0 um:zero minimum:10000 name:PM_PMC3_REWIND : PMC3 rewind event. A rewind happens when a speculative event (such as latency or CPI stack) is selected on PMC3 and the stall reason or reload source did not match the one programmed in PMC3. When this occurs, the count in PMC3 will not change. 
++event:0x000004D012 counters:3 um:zero minimum:10000 name:PM_PMC3_SAVED : PMC3 Rewind Value saved ++event:0x0000010010 counters:0 um:zero minimum:10000 name:PM_PMC4_OVERFLOW : Overflow from counter 4 ++event:0x0000010020 counters:0 um:zero minimum:10000 name:PM_PMC4_REWIND : PMC4 Rewind Event ++event:0x0000030022 counters:2 um:zero minimum:10000 name:PM_PMC4_SAVED : PMC4 Rewind Value saved (matched condition) ++event:0x0000010024 counters:0 um:zero minimum:10000 name:PM_PMC5_OVERFLOW : Overflow from counter 5 ++event:0x0000030024 counters:2 um:zero minimum:10000 name:PM_PMC6_OVERFLOW : Overflow from counter 6 ++event:0x0000040014 counters:3 um:zero minimum:10000 name:PM_PROBE_NOP_DISP : ProbeNops dispatched ++event:0x000000F084 counters:0,1,2,3 um:zero minimum:10000 name:PM_PTE_PREFETCH : PTE prefetches ++event:0x000000589C counters:0,1,2,3 um:zero minimum:10000 name:PM_PTESYNC : ptesync instruction counted when the instruction is decoded and transmitted ++event:0x0000010054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000040052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump misprediction. Counts across all types of pumps for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x000001F056 counters:0 um:zero minimum:10000 name:PM_RADIX_PWC_L1_HIT : A radix translation attempt missed in the TLB and only the first level page walk cache was a hit. 
++event:0x000002D026 counters:1 um:zero minimum:10000 name:PM_RADIX_PWC_L1_PDE_FROM_L2 : A Page Directory Entry was reloaded to a level 1 page walk cache from the core's L2 data cache ++event:0x000003F058 counters:2 um:zero minimum:10000 name:PM_RADIX_PWC_L1_PDE_FROM_L3 : A Page Directory Entry was reloaded to a level 1 page walk cache from the core's L3 data cache ++event:0x000004F056 counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_L1_PDE_FROM_L3MISS : A Page Directory Entry was reloaded to a level 1 page walk cache from beyond the core's L3 data cache. The source could be local/remote/distant memory or another core's cache ++event:0x000002D024 counters:1 um:zero minimum:10000 name:PM_RADIX_PWC_L2_HIT : A radix translation attempt missed in the TLB but hit on both the first and second levels of page walk cache. ++event:0x000002D028 counters:1 um:zero minimum:10000 name:PM_RADIX_PWC_L2_PDE_FROM_L2 : A Page Directory Entry was reloaded to a level 2 page walk cache from the core's L2 data cache ++event:0x000003F05A counters:2 um:zero minimum:10000 name:PM_RADIX_PWC_L2_PDE_FROM_L3 : A Page Directory Entry was reloaded to a level 2 page walk cache from the core's L3 data cache ++event:0x000001F058 counters:0 um:zero minimum:10000 name:PM_RADIX_PWC_L2_PTE_FROM_L2 : A Page Table Entry was reloaded to a level 2 page walk cache from the core's L2 data cache. This implies that level 3 and level 4 PWC accesses were not necessary for this translation ++event:0x000004F058 counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_L2_PTE_FROM_L3 : A Page Table Entry was reloaded to a level 2 page walk cache from the core's L3 data cache. This implies that level 3 and level 4 PWC accesses were not necessary for this translation ++event:0x000004F05C counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_L2_PTE_FROM_L3MISS : A Page Table Entry was reloaded to a level 2 page walk cache from beyond the core's L3 data cache. 
This implies that level 3 and level 4 PWC accesses were not necessary for this translation. The source could be local/remote/distant memory or another core's cache ++event:0x000003F056 counters:2 um:zero minimum:10000 name:PM_RADIX_PWC_L3_HIT : A radix translation attempt missed in the TLB but hit on the first, second, and third levels of page walk cache. ++event:0x000002D02A counters:1 um:zero minimum:10000 name:PM_RADIX_PWC_L3_PDE_FROM_L2 : A Page Directory Entry was reloaded to a level 3 page walk cache from the core's L2 data cache ++event:0x000001F15C counters:0 um:zero minimum:10000 name:PM_RADIX_PWC_L3_PDE_FROM_L3 : A Page Directory Entry was reloaded to a level 3 page walk cache from the core's L3 data cache ++event:0x000002D02E counters:1 um:zero minimum:10000 name:PM_RADIX_PWC_L3_PTE_FROM_L2 : A Page Table Entry was reloaded to a level 3 page walk cache from the core's L2 data cache. This implies that a level 4 PWC access was not necessary for this translation ++event:0x000003F05E counters:2 um:zero minimum:10000 name:PM_RADIX_PWC_L3_PTE_FROM_L3 : A Page Table Entry was reloaded to a level 3 page walk cache from the core's L3 data cache. This implies that a level 4 PWC access was not necessary for this translation ++event:0x000004F05E counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_L3_PTE_FROM_L3MISS : A Page Table Entry was reloaded to a level 3 page walk cache from beyond the core's L3 data cache. This implies that a level 4 PWC access was not necessary for this translation. The source could be local/remote/distant memory or another core's cache ++event:0x000001F05A counters:0 um:zero minimum:10000 name:PM_RADIX_PWC_L4_PTE_FROM_L2 : A Page Table Entry was reloaded to a level 4 page walk cache from the core's L2 data cache. 
This is the deepest level of PWC possible for a translation ++event:0x000004F05A counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_L4_PTE_FROM_L3 : A Page Table Entry was reloaded to a level 4 page walk cache from the core's L3 data cache. This is the deepest level of PWC possible for a translation ++event:0x000003F054 counters:2 um:zero minimum:10000 name:PM_RADIX_PWC_L4_PTE_FROM_L3MISS : A Page Table Entry was reloaded to a level 4 page walk cache from beyond the core's L3 data cache. This is the deepest level of PWC possible for a translation. The source could be local/remote/distant memory or another core's cache ++event:0x000004F054 counters:3 um:zero minimum:10000 name:PM_RADIX_PWC_MISS : A radix translation attempt missed in the TLB and all levels of page walk cache. ++event:0x000001608C counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point) ++event:0x000002608C counters:1 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point) ++event:0x000001688C counters:0 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle (2to1) window where this signals rotates thru sampling each RC machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x00000468A6 counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : Read clearing SC ++event:0x00000460A6 counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : Read forming SC ++event:0x00000268A8 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : RD machine hit L3 PF machine ++event:0x0000000 counters:5 um:zero minimum:10000 name:PM_RUN_CYC : Run_cycles ++event:0x00000200F4 counters:1 um:zero minimum:10000 name:PM_RUN_CYC : Run_cycles ++event:0x000003006C counters:2 um:zero minimum:100000 name:PM_RUN_CYC_SMT2_MODE : Cycles in which this thread's run latch is set and the core is in SMT2 mode ++event:0x000002006C counters:1 um:zero minimum:100000 name:PM_RUN_CYC_SMT4_MODE : Cycles in which this thread's run latch is set and the core is in SMT4 mode ++event:0x000001006C counters:0 um:zero minimum:100000 name:PM_RUN_CYC_ST_MODE : Cycles run latch is set and core is in ST mode ++event:0x0000000 counters:4 um:zero minimum:10000 name:PM_RUN_INST_CMPL : Run_Instructions ++event:0x00000400FA counters:3 um:zero minimum:10000 name:PM_RUN_INST_CMPL : Run_Instructions ++event:0x00000400F4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR ++event:0x0000010008 counters:0 um:zero minimum:10000 name:PM_RUN_SPURR : Run SPURR ++event:0x000000E080 counters:0,1,2,3 um:zero minimum:10000 name:PM_S2Q_FULL : Cycles during which the S2Q is full ++event:0x0000045056 counters:3 um:zero minimum:10000 name:PM_SCALAR_FLOP_CMPL : Scalar flop operation completed ++event:0x000000508C counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_CREATED : Store-Hit-Load Table Entry Created ++event:0x000000588C counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DEP_CREATED : Store-Hit-Load Table Read Hit with entry Enabled ++event:0x0000005090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DISABLE : Store-Hit-Load Table Read Hit with entry Disabled (entry was disabled due to the entry shown 
to not prevent the flush) ++event:0x000000F09C counters:0,1,2,3 um:zero minimum:10000 name:PM_SLB_TABLEWALK_CYC : Cycles when a tablewalk is pending on this thread on the SLB table ++event:0x0000016090 counters:0 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point) ++event:0x0000026090 counters:1 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point) ++event:0x00000460A8 counters:3 um:zero minimum:10000 name:PM_SN_HIT : Any port snooper hit L3. Up to 4 can happen in a cycle but we only count 1 ++event:0x00000368A8 counters:2 um:zero minimum:10000 name:PM_SN_INVL : Any port snooper detects a store to a line in the Sx state and invalidates the line. Up to 4 can happen in a cycle but we only count 1 ++event:0x00000468A8 counters:3 um:zero minimum:10000 name:PM_SN_MISS : Any port snooper L3 miss or collision. Up to 4 can happen in a cycle but we only count 1 ++event:0x000000F880 counters:0,1,2,3 um:zero minimum:10000 name:PM_SNOOP_TLBIE : TLBIE snoop ++event:0x00000360A6 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : Snp TM st hit M/Mu ++event:0x00000368A6 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : Snp TM sthit T/Tn/Te ++event:0x000003688C counters:2 um:zero minimum:10000 name:PM_SN_USAGE : Continuous 16 cycle (2to1) window where this signals rotates thru sampling each SN machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x0000040062 counters:3 um:zero minimum:10000 name:PM_SPACEHOLDER_0x0000040062 : SPACE_HOLDER for event 0x0000040062 ++event:0x0000040064 counters:3 um:zero minimum:10000 name:PM_SPACEHOLDER_0x0000040064 : SPACE_HOLDER for event 0x0000040064 ++event:0x000004505A counters:3 um:zero minimum:10000 name:PM_SP_FLOP_CMPL : SP instruction completed ++event:0x0000040008 counters:3 um:zero minimum:10000 name:PM_SRQ_EMPTY_CYC : Cycles in which the SRQ has at least one (out of four) empty slice ++event:0x000000D0AC counters:0,1,2,3 um:zero minimum:10000 name:PM_SRQ_SYNC_CYC : A sync is in the S2Q (edge detect to count) ++event:0x0000010028 counters:0 um:zero minimum:10000 name:PM_STALL_END_ICT_EMPTY : The number a times the core transitioned from a stall to ICT-empty for this thread ++event:0x000001608E counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non-TM Store caused any thread to fail ++event:0x00000200F0 counters:1 um:zero minimum:10000 name:PM_ST_CMPL : Stores completed from S2Q (2nd-level store queue). ++event:0x000001E058 counters:0 um:zero minimum:10000 name:PM_STCX_FAIL : stcx failed ++event:0x000002E014 counters:1 um:zero minimum:10000 name:PM_STCX_FIN : Number of stcx instructions finished. This includes instructions in the speculative path of a branch that may be flushed ++event:0x000000C8BC counters:0,1,2,3 um:zero minimum:10000 name:PM_STCX_SUCCESS_CMPL : Number of stcx instructions that completed successfully ++event:0x0000020016 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store finish count. 
Includes speculative activity ++event:0x0000020018 counters:1 um:zero minimum:10000 name:PM_ST_FWD : Store forwards that finished ++event:0x00000300F0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1 ++event:0x00000048A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_STOP_FETCH_PENDING_CYC : Fetching is stopped due to an incoming instruction that will result in a flush ++event:0x0000010000 counters:0 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF ++event:0x0000020000 counters:1 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF ++event:0x0000030000 counters:2 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF ++event:0x0000040000 counters:3 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF ++event:0x0000015152 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_LINK : Marked Branch and link branch that can cause a synchronous interrupt ++event:0x000001515C counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_MPRED : Marked Branch mispredict that can cause a synchronous interrupt ++event:0x0000015156 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_FX_DIVIDE : Marked fixed point divide that can cause a synchronous interrupt ++event:0x0000015158 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2HIT : Marked L2 Hits that can throw a synchronous interrupt ++event:0x000001515A counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2MISS : Marked L2 Miss that can throw a synchronous interrupt ++event:0x0000015154 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L3MISS : Marked L3 misses that can throw a synchronous interrupt ++event:0x0000015150 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_PROBE_NOP : Marked probeNops which can cause synchronous interrupts ++event:0x0000030050 counters:2 um:zero minimum:10000 name:PM_SYS_PUMP_CPRED : Initial and Final Pump Scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000030052 counters:2 um:zero minimum:10000 
name:PM_SYS_PUMP_MPRED : Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000040050 counters:3 um:zero minimum:10000 name:PM_SYS_PUMP_MPRED_RTY : Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x0000010026 counters:0 um:zero minimum:10000 name:PM_TABLEWALK_CYC : Cycles when an instruction tablewalk is active ++event:0x000000F884 counters:0,1,2,3 um:zero minimum:10000 name:PM_TABLEWALK_CYC_PREF : tablewalk qualified for pte prefetches ++event:0x00000058B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TAGE_CORRECT : The TAGE overrode BHT direction prediction and it was correct. Includes taken and not taken and is counted at execution time ++event:0x00000050B4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TAGE_CORRECT_TAKEN_CMPL : The TAGE overrode BHT direction prediction and it was correct. Counted at completion for taken branches only ++event:0x00000050B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TAGE_OVERRIDE_WRONG : The TAGE overrode BHT direction prediction but it was incorrect. Counted at completion for taken branches only ++event:0x00000058B8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TAGE_OVERRIDE_WRONG_SPEC : The TAGE overrode BHT direction prediction but it was incorrect. Includes taken and not taken and is counted at execution time ++event:0x0000020056 counters:1 um:zero minimum:10000 name:PM_TAKEN_BR_MPRED_CMPL : Total number of taken branches that were incorrectly predicted as not-taken.
This event counts branches completed and does not include speculative instructions ++event:0x00000300F8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event ++event:0x000000E8B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_TEND_PEND_CYC : TEND latency per thread ++event:0x000002000C counters:1 um:zero minimum:100000 name:PM_THRD_ALL_RUN_CYC : Cycles in which all the threads have the run latch set ++event:0x00000300F4 counters:2 um:zero minimum:10000 name:PM_THRD_CONC_RUN_INST : PPC Instructions Finished by this thread when all threads in the core had the run-latch set ++event:0x00000040BC counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_0_1_CYC : Cycles thread running at priority level 0 or 1 ++event:0x00000048BC counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_2_3_CYC : Cycles thread running at priority level 2 or 3 ++event:0x0000005080 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_4_5_CYC : Cycles thread running at priority level 4 or 5 ++event:0x0000005880 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_6_7_CYC : Cycles thread running at priority level 6 or 7 ++event:0x0000024154 counters:1 um:zero minimum:10000 name:PM_THRESH_ACC : This event increments every time the threshold event counter ticks. Thresholding must be enabled (via MMCRA) and the thresholding start event must occur for this counter to increment. It will stop incrementing when the thresholding stop event occurs or when thresholding is disabled, until the next time a configured thresholding start event occurs. 
++event:0x00000301EA counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_1024 : Threshold counter exceeded a value of 1024 ++event:0x00000401EA counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_128 : Threshold counter exceeded a value of 128 ++event:0x00000401EC counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x00000101E8 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_256 : Threshold counter exceeded a value of 256 ++event:0x00000201E6 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_32 : Threshold counter exceeded a value of 32 ++event:0x00000101E6 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_4096 : Threshold counter exceeded a value of 4096 ++event:0x00000201E8 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_512 : Threshold counter exceeded a value of 512 ++event:0x00000301E8 counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_64 : Threshold counter exceeded a value of 64 ++event:0x00000101EC counters:0 um:zero minimum:10000 name:PM_THRESH_MET : threshold exceeded ++event:0x000004016E counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshold counter did not meet threshold ++event:0x000001F054 counters:0 um:zero minimum:10000 name:PM_TLB_HIT : Number of times the TLB had the data required by the instruction.
Applies to both HPT and RPT ++event:0x0000030058 counters:2 um:zero minimum:10000 name:PM_TLBIE_FIN : tlbie finished ++event:0x0000020066 counters:1 um:zero minimum:10000 name:PM_TLB_MISS : TLB Miss (I + D) ++event:0x0000030056 counters:2 um:zero minimum:10000 name:PM_TM_ABORTS : Number of TM transactions aborted ++event:0x000000E0A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding ++event:0x00000168A6 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : L3 TM cam overflow during L2 co of SC ++event:0x000004608E counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capacity Overflow ++event:0x00000028A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_NON_TM : TM aborted because a conflict occurred with a non-transactional access by another processor ++event:0x00000020AC counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_TM : TM aborted because a conflict occurred with another transaction. ++event:0x00000020A8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_FOOTPRINT_OVERFLOW : TM aborted because the tracking limit for transactional storage accesses was exceeded.. 
Asynchronous ++event:0x000000E0B0 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_NON_TX_CONFLICT : Non transactional conflict from LSU, gets reported to TEXASR ++event:0x00000028AC counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_SELF : TM aborted because a self-induced conflict occurred in Suspended state, due to one of the following: a store to a storage location that was previously accessed transactionally ++event:0x000000E0AC counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TLBIE : Transaction failed because there was a TLBIE hit in the bloom filter ++event:0x000000E8AC counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TX_CONFLICT : Transactional conflict from LSU, gets reported to TEXASR ++event:0x000002688E counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail ++event:0x000000209C counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAV_TBEGIN : Dispatch time Favored tbegin ++event:0x000001688E counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non-TM Load caused any thread to fail ++event:0x000002608E counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) ++event:0x00000020A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_NESTED_TBEGIN : Completion Tm nested tbegin ++event:0x0000002098 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_NESTED_TEND : Completion time nested tend ++event:0x000000289C counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_NON_FAV_TBEGIN : Dispatch time non favored tbegin ++event:0x0000002094 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_OUTER_TBEGIN : Completion time outer tbegin ++event:0x000004E05E counters:3 um:zero minimum:10000 name:PM_TM_OUTER_TBEGIN_DISP : Number of outer tbegin instructions dispatched. The dispatch unit determines whether the tbegin instruction is outer or nested. 
This is a speculative count, which includes flushed instructions ++event:0x0000002894 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_OUTER_TEND : Completion time outer tend ++event:0x000002E052 counters:1 um:zero minimum:10000 name:PM_TM_PASSED : Number of TM transactions that passed ++event:0x00000268A6 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : TM-snp rst RM SC ++event:0x00000160A6 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : L3 castout TM SC line ++event:0x000003688E counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to fail ++event:0x000003608E counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) ++event:0x0000002898 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TABORT_TRECLAIM : Completion time tabortnoncd, tabortcd, treclaim ++event:0x0000010060 counters:0 um:zero minimum:10000 name:PM_TM_TRANS_RUN_CYC : run cycles in transactional state ++event:0x0000030060 counters:2 um:zero minimum:10000 name:PM_TM_TRANS_RUN_INST : Run instructions completed in transactional state (gated by the run latch) ++event:0x00000020A4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TRESUME : TM resume instruction completed ++event:0x00000028A0 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TSUSPEND : TM suspend instruction completed ++event:0x000002E012 counters:1 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_CYC : cycles spent in successful transactions ++event:0x000004E014 counters:3 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_INST : Run instructions spent in successful transactions ++event:0x000004D058 counters:3 um:zero minimum:10000 name:PM_VECTOR_FLOP_CMPL : Vector FP instruction completed ++event:0x0000044054 counters:3 um:zero minimum:10000 name:PM_VECTOR_LD_CMPL : Number of vector load instructions completed ++event:0x0000044056 counters:3 um:zero minimum:10000 name:PM_VECTOR_ST_CMPL : Number of vector store instructions completed 
++event:0x000003D058 counters:2 um:zero minimum:10000 name:PM_VSU_DP_FSQRT_FDIV : vector versions of fdiv,fsqrt ++event:0x000002505C counters:1 um:zero minimum:10000 name:PM_VSU_FIN : VSU instruction finished. Up to 4 per cycle ++event:0x000004D04E counters:3 um:zero minimum:10000 name:PM_VSU_FSQRT_FDIV : four flops operation (fdiv,fsqrt) Scalar Instructions only ++event:0x000004D050 counters:3 um:zero minimum:10000 name:PM_VSU_NON_FLOP_CMPL : Non FLOP operation completed ++event:0x000000F098 counters:0,1,2,3 um:zero minimum:10000 name:PM_XLATE_HPT_MODE : LSU reports every cycle the thread is in HPT translation mode (as opposed to radix mode) ++event:0x000000F89C counters:0,1,2,3 um:zero minimum:10000 name:PM_XLATE_MISS : The LSU requested a line from L2 for translation. It may be satisfied from any source beyond L2. Includes speculative instructions ++event:0x000000F898 counters:0,1,2,3 um:zero minimum:10000 name:PM_XLATE_RADIX_MODE : LSU reports every cycle the thread is in radix translation mode (as opposed to HPT mode) +commit eb8852f7d60c05f7c1bcb4de5b9b40090fd42238 +Author: Will Schmidt +Date: Fri Jun 9 14:29:51 2017 -0500 + + oprofile update pointers to documentation for Power + + Hi, + Update the documentation pointers to the current home + for the Power ISA and P8 Users Manual. 
+ + Signed-off-by: Will Schmidt + + Thanks + -Will + + -- + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index f76bf2a..9a2a7dd 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -723,7 +723,10 @@ int main(int argc, char const * argv[]) + case CPU_PPC64_POWER8: + event_doc = + "This processor type is fully supported with operf; opcontrol timer mode may be available.\n" +- "See Power ISA 2.07 at https://www.power.org/\n\n"; ++ "See Power ISA 3.0B at " ++ "https://openpowerfoundation.org/?submit=Search&s=ISA \n" ++ "And the P8 Users Manual at " ++ "https://www-355.ibm.com/systems/power/openpower \n\n"; + break; + + case CPU_PPC64_CELL: +@@ -735,7 +739,8 @@ int main(int argc, char const * argv[]) + case CPU_PPC64_POWER9: + event_doc = + "This processor type is fully supported with operf.\n" +- "See Power ISA 3.0 at https://www.power.org/\n\n"; ++ "See Power ISA 3.0B at " ++ "https://openpowerfoundation.org/?resource_lib=power-isa-version-3-0\n"; + break; + + case CPU_MIPS_20K: diff --git a/SOURCES/oprofile-ppc64-equivalent.patch b/SOURCES/oprofile-ppc64-equivalent.patch new file mode 100644 index 0000000..3c5b063 --- /dev/null +++ b/SOURCES/oprofile-ppc64-equivalent.patch @@ -0,0 +1,140 @@ +commit 4f5a0d9c4419f3b88586d665272eb35f270a0551 +Author: Maynard Johnson +Date: Tue Dec 17 16:04:33 2013 -0600 + + Allow all native events for IBM POWER8 in POWER7 compat mode + + Certain older Linux distributions will support the new IBM POWER8 + processor, but only in a limited mode, since much of the new + kernel code needed to fully support the POWER8 was not backported + to these older distros. This limited mode is referred to as + "POWER7 compat mode" since the kernel can support only the features + that were also available on that earlier IBM processor. 
+ + Changes I originally made to support POWER8 assumed that there + would not be full POWER8 performance monitor unit capabilities when + in POWER7 compat mode, and thus, the current oprofile code supports + only a limited subset of POWER8 events (i.e., events which were also + available on the POWER7). However, I've recently been made aware + that these older distros actually do have complete backports of the + POWER8 perf_events kernel subsystem code, making them fully aware of + all POWER8 events. This patch allows operf and ocount to use all + of the POWER8 events, regardless of what mode or distribution we + are running on. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 4bb34b7..cd75ad4 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -290,7 +290,16 @@ static op_cpu _try_ppc64_arch_generic_cpu(void) + } + } + if (!platforms_are_equivalent) { +- if (strcmp(platform, "power7") == 0) ++ // FIXME ++ /* For POWER8 running in POWER7 compat mode (RHEL 6.5 and SLES 11 SP4), ++ * the kernel will have enough POWER8-specific PMU code so we can utilize ++ * all of the POWER8 events. In general, this is not necessarily the case ++ * when running in compat mode. This code needs to be inspected for every ++ * new IBM Power processor released, but for now, we'll assume that for the ++ * next processor model (assuming there will be something like a POWER9?), ++ * we should use just the architected events when running POWER8 compat mode. ++ */ ++ if (strcmp(platform, "power8") == 0) + cpu_type = CPU_PPC64_ARCH_V1; + } + } +commit 88ed74bade0096042d643a6d7e68c2cbc4b6e34d +Author: Maynard Johnson +Date: Thu Jan 9 15:07:21 2014 -0600 + + Fix "Unable to open cpu_type file for reading" for IBM POWER7+ + + Using operf to do profiling on an IBM POWER7+ may result in + the following error message: + + Unable to open cpu_type file for reading + + This patch fixes the problem. 
There is also a simple workaround of + running 'opcontrol --init'. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index cd75ad4..7d5262c 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -326,6 +326,8 @@ static op_cpu _get_ppc64_cpu_type(void) + for (i = 0; i < (int)len ; i++) + cpu_name_lowercase[i] = tolower(cpu_name[i]); + ++ if (strncmp(cpu_name_lowercase, "power7+", 7) == 0) ++ cpu_name_lowercase[6] = '\0'; + cpu_type_str[0] = '\0'; + strcat(cpu_type_str, "ppc64/"); + strncat(cpu_type_str, cpu_name_lowercase, len); +commit 65176cb1af0fb1f6c7d3ddba4ab5f5f23c5f7c62 +Author: Maynard Johnson +Date: Tue Jan 21 14:43:02 2014 -0600 + + Fix regression in IBM POWER8 running in POWER7 compat mode + + A commit made on Dec 17, 2013 ("Allow all native events for IBM POWER8 + in POWER7 compat mode") broke support for POWER8 in POWER7 compat mode. + Instead, oprofile attempts to treat it as a normal POWER7 processor, + which is not correct. A user reported the following error when + running operf with the default CYCLES event: + + terminate called after throwing an instance of 'std::runtime_error' + what(): libpfm cannot find event code for CYCLES; cannot continue + Aborted + + This patch fixes this problem. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index a3ad804..2907f36 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -300,7 +300,9 @@ static op_cpu _try_ppc64_arch_generic_cpu(void) + * next processor model (assuming there will be something like a POWER9?), + * we should use just the architected events when running POWER8 compat mode.
+ */ +- if (strcmp(platform, "power8") == 0) ++ if ((strcmp(platform, "power7") == 0) && (strcmp(base_platform, "power8") == 0)) ++ cpu_type = CPU_PPC64_POWER8; ++ else + cpu_type = CPU_PPC64_ARCH_V1; + } + } +commit 7243fa4ed8a25c6e59225a863fd263ce70989087 +Author: Maynard Johnson +Date: Tue Feb 4 08:27:10 2014 -0600 + + Make cpu type POWER8E equivalent to POWER8 + + Recent mainline kernel changes resulted in a cpu type of + "POWER8E" being displayed in /proc/cpuinfo for certain revisions + of the IBM POWER8 processor model. But for profiling and + counting of native events, we can ignore the differences between + POWER8 and POWER8E. This patch addresses that issue. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 2907f36..1ae2913 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -331,6 +331,9 @@ static op_cpu _get_ppc64_cpu_type(void) + + if (strncmp(cpu_name_lowercase, "power7+", 7) == 0) + cpu_name_lowercase[6] = '\0'; ++ if (strncmp(cpu_name_lowercase, "power8e", 7) == 0) ++ cpu_name_lowercase[6] = '\0'; ++ + cpu_type_str[0] = '\0'; + strcat(cpu_type_str, "ppc64/"); + strncat(cpu_type_str, cpu_name_lowercase, len); diff --git a/SOURCES/oprofile-ppc64jvm.patch b/SOURCES/oprofile-ppc64jvm.patch new file mode 100644 index 0000000..e870335 --- /dev/null +++ b/SOURCES/oprofile-ppc64jvm.patch @@ -0,0 +1,63 @@ +commit a41b4231ccfc83fb99271507a8e98f84a348e71d +Author: Rei Odaira +Date: Fri May 22 15:34:50 2015 -0400 + + Filter out zero-sized mapping to avoid opjitconv running indefinitely + + I found opjitconv ran indefinitely when profiling a Java application + running on OpenJDK/ppc64le. This is because OpenJDK sometimes reports + generation of zero-size jitted code via JVMTI, but scan_overlaps() in + opjitconv does not assume the existence of jitted code with size zero. + + (1) scan_overlaps() finds overlap between a normal jitted code and a + zero-size jitted code. 
+ (2) eliminate_overlaps() tries to split the zero-size jitted code but + cannot. + (3) resolve_overlaps() incorrectly thinks the split has happened and + invokes scan_overlaps() again. + (4) Back to (1) + + One solution is to remove all the zero-size entries before resolving + overlaps which is implemented by this patch. + + Signed-off-by: William Cohen + +diff --git a/opjitconv/jitsymbol.c b/opjitconv/jitsymbol.c +index e2b1e66..1b980af 100644 +--- a/opjitconv/jitsymbol.c ++++ b/opjitconv/jitsymbol.c +@@ -201,6 +201,26 @@ static void invalidate_earlybirds(unsigned long long start_time) + } + } + ++static void invalidate_zero_size_entries(void) ++{ ++ u32 i; ++ int flag; ++ struct jitentry * a; ++ ++ flag = 0; ++ for (i = 0; i < entry_count; i++) { ++ a = entries_address_ascending[i]; ++ if (a->code_size == 0) { ++ invalidate_entry(a); ++ flag = 1; ++ } ++ } ++ if (flag) { ++ resort_address(); ++ resort_symbol(); ++ } ++} ++ + + /* select the symbol with the longest life time in the index range */ + static int select_one(int start_idx, int end_idx) +@@ -505,6 +525,7 @@ int resolve_overlaps(unsigned long long start_time) + int cnt = 0; + + invalidate_earlybirds(start_time); ++ invalidate_zero_size_entries(); + while ((rc = scan_overlaps()) && rc != OP_JIT_CONV_FAIL) { + resort_address(); + if (cnt == 0) { diff --git a/SOURCES/oprofile-ppc64le.patch b/SOURCES/oprofile-ppc64le.patch new file mode 100644 index 0000000..31dd258 --- /dev/null +++ b/SOURCES/oprofile-ppc64le.patch @@ -0,0 +1,50 @@ +commit a265c549bff149f5e9064dca7d06b6689fb3d64e +Author: Maynard Johnson +Date: Thu Jan 9 15:47:09 2014 -0600 + + Enable oprofile for new ppc64le architecture + + Signed-off-by: Maynard Johnson + +diff --git a/configure.ac b/configure.ac +index 457145a..1e3a65f 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -184,7 +184,7 @@ AC_DEFINE_UNQUOTED(HAVE_PERF_EVENTS, $HAVE_PERF_EVENTS, [Kernel support for perf + AC_CANONICAL_HOST + if test "$HAVE_PERF_EVENTS" = "1"; then + 
PFM_LIB= +- if test "$host_cpu" = "powerpc64"; then ++ if test "$host_cpu" = "powerpc64le" -o "$host_cpu" = "powerpc64"; then + AC_CHECK_HEADER(perfmon/pfmlib.h,,[AC_MSG_ERROR([pfmlib.h not found; usually provided in papi devel package])]) + AC_CHECK_LIB(pfm,pfm_get_os_event_encoding, HAVE_LIBPFM3='0'; HAVE_LIBPFM='1', [ + AC_CHECK_LIB(pfm, pfm_get_event_name, HAVE_LIBPFM3='1'; HAVE_LIBPFM='1', +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 7d5262c..15c71ab 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -621,7 +621,8 @@ static op_cpu __get_cpu_type_alt_method(void) + fnmatch("i?86", uname_info.machine, 0) == 0) { + return _get_x86_64_cpu_type(); + } +- if (strncmp(uname_info.machine, "ppc64", 5) == 0) { ++ if ((strncmp(uname_info.machine, "ppc64", 5) == 0) || ++ (strncmp(uname_info.machine, "ppc64le", 7) == 0)) { + return _get_ppc64_cpu_type(); + } + if (strncmp(uname_info.machine, "arm", 3) == 0 || +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index 67edd09..4b744f8 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -634,9 +634,7 @@ void bfd_info::translate_debuginfo_syms(asymbol ** dbg_syms, long nr_dbg_syms) + bool bfd_info::get_synth_symbols() + { + extern const bfd_target bfd_elf64_powerpc_vec; +- extern const bfd_target bfd_elf64_powerpcle_vec; +- bool is_elf64_powerpc_target = (abfd->xvec == &bfd_elf64_powerpc_vec) +- || (abfd->xvec == &bfd_elf64_powerpcle_vec); ++ bool is_elf64_powerpc_target = (abfd->xvec == &bfd_elf64_powerpc_vec); + + if (!is_elf64_powerpc_target) + return false; diff --git a/SOURCES/oprofile-remap.patch b/SOURCES/oprofile-remap.patch new file mode 100644 index 0000000..1252d79 --- /dev/null +++ b/SOURCES/oprofile-remap.patch @@ -0,0 +1,60 @@ +commit 1c54c9a3d96dd8d9d1d579baaeabc94d0f923ee8 +Author: William Cohen +Date: Fri Jul 10 15:41:33 2015 -0400 + + Improve handling of remapped anon regions across processes + + Java runtime environments use 
dynamically allocated memory in + anonymous regions to store Just-In-Time translated code. The Java + runtime system may change the access permissions for portions of mmap + regions during execution and operf needs to be tolerant of those + changes to a portion of the mmap. operf also needs to keep the anon + memory maps distinct between processes to avoid confusion about the + sizes of the memory regions. + + Signed-off-by: William Cohen + +diff --git a/libperf_events/operf_process_info.h b/libperf_events/operf_process_info.h +index f98591f..3138ffb 100644 +--- a/libperf_events/operf_process_info.h ++++ b/libperf_events/operf_process_info.h +@@ -25,6 +25,7 @@ struct operf_mmap { + u64 start_addr; + u64 end_addr; + u64 pgoff; ++ u32 pid; + bool is_anon_mapping; + bool is_hypervisor; + char filename[PATH_MAX]; +diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index 90a0765..ff972d4 100644 +--- a/libperf_events/operf_utils.cpp ++++ b/libperf_events/operf_utils.cpp +@@ -275,7 +275,10 @@ static void __handle_mmap_event(event_t * event) + range = all_images_map.equal_range(image_basename); + for (it = range.first; it != range.second; it++) { + if (((strcmp((*it).second->filename, image_basename.c_str())) == 0) +- && ((*it).second->start_addr == event->mmap.start)) { ++ && ((*it).second->pid == 0 || (*it).second->pid == event->mmap.pid) ++ && ((*it).second->start_addr <= event->mmap.start ++ && ((*it).second->end_addr >= event->mmap.start + event->mmap.len))) ++ { + mapping = (*it).second; + break; + } +@@ -291,12 +294,15 @@ static void __handle_mmap_event(event_t * event) + */ + if (mapping->filename[0] == '[') { + mapping->is_anon_mapping = true; ++ mapping->pid = event->mmap.pid; + } else if ((strncmp(mapping->filename, "//anon", + strlen("//anon")) == 0)) { + mapping->is_anon_mapping = true; ++ mapping->pid = event->mmap.pid; + strcpy(mapping->filename, "anon"); + } else if ((strncmp(mapping->filename, "/anon_hugepage", +
strlen("/anon_hugepage")) == 0)) { ++ mapping->pid = event->mmap.pid; + mapping->is_anon_mapping = true; + strcpy(mapping->filename, "anon"); + } diff --git a/SOURCES/oprofile-rhbz1121205.patch b/SOURCES/oprofile-rhbz1121205.patch new file mode 100644 index 0000000..5a14bff --- /dev/null +++ b/SOURCES/oprofile-rhbz1121205.patch @@ -0,0 +1,1244 @@ +commit ebde58121d34e30f57ab173bf425244ce0712d48 +Author: Maynard Johnson +Date: Wed Oct 9 13:12:21 2013 -0500 + + Converge operf and ocount utility functions + + When the ocount tool was developed, a number of utility + functions were needed that were very similar to operf utility + functions, with just minor changes. The decision was made at + the time to copy these functions into ocount and change them + as needed. To avoid dual maintenance on very similar functions, + we should converge the two tools to use one common set of utility + functions. The main reason for not doing so in the first place + was to make it easier to review ocount patches and not have to + look at operf changes at the same time. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/Makefile.am b/Makefile.am +index 293114b..2fe8d2f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -19,9 +19,9 @@ SUBDIRS = \ + events \ + doc \ + gui \ ++ libpe_utils \ + libperf_events \ + pe_profiling \ +- libpe_utils \ + pe_counting \ + agents + #### ATTENTION #### +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index dc9459e..b85d175 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -52,7 +52,9 @@ extern op_cpu cpu_type; + + using namespace std; + +-static int _op_get_next_online_cpu(DIR * dir, struct dirent *entry) ++// Global functions ++ ++int op_pe_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry) + { + #define OFFLINE 0x30 + unsigned int cpu_num; +@@ -86,8 +88,6 @@ static int _op_get_next_online_cpu(DIR * dir, struct dirent *entry) + return cpu_num; + } + +-// Global functions +- + int op_pe_utils::op_get_sys_value(const char * filename) + { + char str[10]; +@@ -148,7 +148,7 @@ int op_pe_utils::op_get_cpu_for_perf_events_cap(void) + goto error; + } else { + struct dirent *entry = NULL; +- retval = _op_get_next_online_cpu(dir, entry); ++ retval = op_get_next_online_cpu(dir, entry); + closedir(dir); + } + } else { +@@ -310,40 +310,6 @@ int op_pe_utils::op_validate_app_name(char ** app, char ** save_appname) + + out: return rc; + } +-static int _get_next_online_cpu(DIR * dir, struct dirent *entry) +-{ +-#define OFFLINE 0x30 +- unsigned int cpu_num; +- char cpu_online_pathname[40]; +- int res; +- FILE * online; +- again: +- do { +- entry = readdir(dir); +- if (!entry) +- return -1; +- } while (entry->d_type != DT_DIR); +- +- res = sscanf(entry->d_name, "cpu%u", &cpu_num); +- if (res <= 0) +- goto again; +- +- errno = 0; +- snprintf(cpu_online_pathname, 40, "/sys/devices/system/cpu/cpu%u/online", cpu_num); +- if ((online = fopen(cpu_online_pathname, "r")) == NULL) { +- cerr << "Unable to open " << cpu_online_pathname << endl; +- if (errno) +- 
cerr << strerror(errno) << endl; +- return -1; +- } +- res = fgetc(online); +- fclose(online); +- if (res == OFFLINE) +- goto again; +- else +- return cpu_num; +-} +- + + set op_pe_utils::op_get_available_cpus(int max_num_cpus) + { +@@ -392,7 +358,7 @@ set op_pe_utils::op_get_available_cpus(int max_num_cpus) + if (all_cpus_avail) { + available_cpus.insert(cpu); + } else { +- real_cpu = _get_next_online_cpu(dir, entry); ++ real_cpu = op_get_next_online_cpu(dir, entry); + if (real_cpu < 0) { + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; +@@ -803,7 +769,8 @@ static bool convert_event_vals(vector * evt_vec) + + + +-void op_pe_utils::op_process_events_list(vector & passed_evts) ++void op_pe_utils::op_process_events_list(vector & passed_evts, ++ bool do_profiling, bool do_callgraph) + { + string cmd = OP_BINDIR; + +@@ -812,7 +779,9 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + << OP_MAX_EVENTS << "." << endl; + exit(EXIT_FAILURE); + } +- cmd += "/ophelp --check-events --ignore-count "; ++ cmd += "/ophelp --check-events "; ++ if (!do_profiling) ++ cmd += "--ignore-count "; + for (unsigned int i = 0; i < passed_evts.size(); i++) { + FILE * fp; + string full_cmd = cmd; +@@ -825,6 +794,8 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + event_spec = _handle_powerpc_event_spec(event_spec); + #endif + ++ if (do_callgraph) ++ full_cmd += " --callgraph=1 "; + full_cmd += event_spec; + fp = popen(full_cmd.c_str(), "r"); + if (fp == NULL) { +@@ -836,14 +807,21 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + pclose(fp); + cerr << "Error retrieving info for event " + << event_spec << endl; ++ if (do_callgraph) ++ cerr << "Note: When doing callgraph profiling, the sample count must be" ++ << endl << "15 times the minimum count value for the event." 
<< endl; + exit(EXIT_FAILURE); + } + pclose(fp); + char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); + operf_event_t event; + strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN - 1); ++ if (do_profiling) ++ event.count = atoi(strtok(NULL, ":")); ++ else ++ event.count = 0UL; + /* Event name is required in the event spec in order for +- * 'ophelp --check-events --ignore-count' to pass. But since unit mask ++ * 'ophelp --check-events' to pass. But since unit mask + * and domain control bits are optional, we need to ensure the result of + * strtok is valid. + */ +@@ -854,7 +832,6 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + int place = _OP_UM; + char * endptr = NULL; + event.evt_um = 0UL; +- event.count = 0UL; + event.no_kernel = 0; + event.no_user = 0; + event.throttled = false; +@@ -904,7 +881,7 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + #endif + } + +-void op_pe_utils::op_get_default_event(void) ++void op_pe_utils::op_get_default_event(bool do_callgraph) + { + operf_event_t dft_evt; + struct op_default_event_descr descr; +@@ -918,7 +895,18 @@ void op_pe_utils::op_get_default_event(void) + } + + memset(&dft_evt, 0, sizeof(dft_evt)); +- dft_evt.count = descr.count; ++ if (do_callgraph) { ++ struct op_event * _event; ++ op_events(cpu_type); ++ if ((_event = find_event_by_name(descr.name, 0, 0))) { ++ dft_evt.count = _event->min_count * CALLGRAPH_MIN_COUNT_SCALE; ++ } else { ++ cerr << "Error getting event info for " << descr.name << endl; ++ exit(EXIT_FAILURE); ++ } ++ } else { ++ dft_evt.count = descr.count; ++ } + dft_evt.evt_um = descr.um; + strncpy(dft_evt.name, descr.name, OP_MAX_EVT_NAME_LEN - 1); + _get_event_code(&dft_evt, cpu_type); +diff --git a/libpe_utils/op_pe_utils.h b/libpe_utils/op_pe_utils.h +index 400eed3..08b6fae 100644 +--- a/libpe_utils/op_pe_utils.h ++++ b/libpe_utils/op_pe_utils.h +@@ -18,11 +18,13 @@ + #include + + #include ++#include + + #include 
"op_cpu_type.h" + + #define OP_APPNAME_LEN 1024 + #define OP_MAX_EVENTS 24 ++#define CALLGRAPH_MIN_COUNT_SCALE 15 + + /* A macro to be used for ppc64 architecture-specific code. The '__powerpc__' macro + * is defined for both ppc64 and ppc32 architectures, so we must further qualify by +@@ -38,8 +40,10 @@ extern int op_check_perf_events_cap(bool use_cpu_minus_one); + extern int op_get_sys_value(const char * filename); + extern int op_get_cpu_for_perf_events_cap(void); + extern int op_validate_app_name(char ** app, char ** save_appname); +-extern void op_get_default_event(void); +-extern void op_process_events_list(std::vector & passed_evts); ++extern void op_get_default_event(bool do_callgraph); ++extern void op_process_events_list(std::vector & passed_evts, ++ bool do_profiling, bool do_callgraph); ++extern int op_get_next_online_cpu(DIR * dir, struct dirent *entry); + extern std::set op_get_available_cpus(int max_num_cpus); + } + +diff --git a/libperf_events/Makefile.am b/libperf_events/Makefile.am +index 7163610..cf5f434 100644 +--- a/libperf_events/Makefile.am ++++ b/libperf_events/Makefile.am +@@ -7,6 +7,7 @@ AM_CPPFLAGS = \ + -I ${top_srcdir}/libop \ + -I ${top_srcdir}/libdb \ + -I ${top_srcdir}/libperf_events \ ++ -I ${top_srcdir}/libpe_utils \ + @PERF_EVENT_FLAGS@ \ + @OP_CPPFLAGS@ + +diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp +index b4cceaa..319e859 100644 +--- a/libperf_events/operf_counter.cpp ++++ b/libperf_events/operf_counter.cpp +@@ -31,6 +31,7 @@ + #include "operf_process_info.h" + #include "op_libiberty.h" + #include "operf_stats.h" ++#include "op_pe_utils.h" + + + using namespace std; +@@ -645,7 +646,7 @@ void operf_record::setup() + } else if (all_cpus_avail) { + real_cpu = cpu; + } else { +- real_cpu = op_get_next_online_cpu(dir, entry); ++ real_cpu = op_pe_utils::op_get_next_online_cpu(dir, entry); + if (real_cpu < 0) { + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; 
+diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index 30e64d8..faed9a6 100644 +--- a/libperf_events/operf_utils.cpp ++++ b/libperf_events/operf_utils.cpp +@@ -65,161 +65,6 @@ static list unresolved_events; + static struct operf_transient trans; + static bool sfile_init_done; + +-/* Some architectures (e.g., ppc64) do not use the same event value (code) for oprofile +- * and for perf_events. The operf-record process requires event values that perf_events +- * understands, but the operf-read process requires oprofile event values. The purpose of +- * the following method is to map the operf-record event value to a value that +- * opreport can understand. +- */ +-#if PPC64_ARCH +-extern op_cpu cpu_type; +-#define NIL_CODE ~0U +- +-#if HAVE_LIBPFM3 +-static bool _get_codes_for_match(unsigned int pfm_idx, const char name[], +- vector * evt_vec) +-{ +- unsigned int num_events = evt_vec->size(); +- int tmp_code, ret; +- char evt_name[OP_MAX_EVT_NAME_LEN]; +- unsigned int events_converted = 0; +- for (unsigned int i = 0; i < num_events; i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (event.evt_code != NIL_CODE) { +- events_converted++; +- continue; +- } +- memset(evt_name, 0, OP_MAX_EVT_NAME_LEN); +- if (!strcmp(event.name, "CYCLES")) { +- strcpy(evt_name ,"PM_CYC") ; +- } else if (strstr(event.name, "_GRP")) { +- string str = event.name; +- strncpy(evt_name, event.name, str.rfind("_GRP")); +- } else { +- strncpy(evt_name, event.name, strlen(event.name)); +- } +- if (strncmp(name, evt_name, OP_MAX_EVT_NAME_LEN)) +- continue; +- ret = pfm_get_event_code(pfm_idx, &tmp_code); +- if (ret != PFMLIB_SUCCESS) { +- string evt_name_str = event.name; +- string msg = "libpfm cannot find event code for " + evt_name_str + +- "; cannot continue"; +- throw runtime_error(msg); +- } +- event.evt_code = tmp_code; +- (*evt_vec)[i] = event; +- events_converted++; +- cverb << vrecord << "Successfully converted " << event.name << " to perf_event code " +- << 
hex << tmp_code << endl; +- } +- return (events_converted == num_events); +-} +-#else +-static bool _op_get_event_codes(vector * evt_vec) +-{ +- int ret, i; +- unsigned int num_events = evt_vec->size(); +- char evt_name[OP_MAX_EVT_NAME_LEN]; +- unsigned int events_converted = 0; +- uint64_t code[1]; +- +- typedef struct { +- uint64_t *codes; +- char **fstr; +- size_t size; +- int count; +- int idx; +- } pfm_raw_pmu_encode_t; +- +- pfm_raw_pmu_encode_t raw; +- raw.codes = code; +- raw.count = 1; +- raw.fstr = NULL; +- +- if (pfm_initialize() != PFM_SUCCESS) +- throw runtime_error("Unable to initialize libpfm; cannot continue"); +- +- for (unsigned int i = 0; i < num_events; i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (event.evt_code != NIL_CODE) { +- events_converted++; +- continue; +- } +- memset(evt_name, 0, OP_MAX_EVT_NAME_LEN); +- if (!strcmp(event.name, "CYCLES")) { +- strcpy(evt_name ,"PM_CYC") ; +- } else if (strstr(event.name, "_GRP")) { +- string str = event.name; +- strncpy(evt_name, event.name, str.rfind("_GRP")); +- } else { +- strncpy(evt_name, event.name, strlen(event.name)); +- } +- +- memset(&raw, 0, sizeof(raw)); +- ret = pfm_get_os_event_encoding(evt_name, PFM_PLM3, PFM_OS_NONE, &raw); +- if (ret != PFM_SUCCESS) { +- string evt_name_str = event.name; +- string msg = "libpfm cannot find event code for " + evt_name_str + +- "; cannot continue"; +- throw runtime_error(msg); +- } +- +- event.evt_code = raw.codes[0]; +- (*evt_vec)[i] = event; +- events_converted++; +- cverb << vrecord << "Successfully converted " << event.name << " to perf_event code " +- << hex << event.evt_code << endl; +- } +- return (events_converted == num_events); +-} +-#endif +- +-bool OP_perf_utils::op_convert_event_vals(vector * evt_vec) +-{ +- unsigned int i, count; +- char name[256]; +- int ret; +- for (unsigned int i = 0; i < evt_vec->size(); i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (cpu_type == CPU_PPC64_POWER7) { +- if (!strncmp(event.name, 
"PM_RUN_CYC", strlen("PM_RUN_CYC"))) { +- event.evt_code = 0x600f4; +- } else if (!strncmp(event.name, "PM_RUN_INST_CMPL", strlen("PM_RUN_INST_CMPL"))) { +- event.evt_code = 0x500fa; +- } else { +- event.evt_code = NIL_CODE; +- } +- } else { +- event.evt_code = NIL_CODE; +- } +- (*evt_vec)[i] = event; +- } +- +-#if HAVE_LIBPFM3 +- if (pfm_initialize() != PFMLIB_SUCCESS) +- throw runtime_error("Unable to initialize libpfm; cannot continue"); +- +- ret = pfm_get_num_events(&count); +- if (ret != PFMLIB_SUCCESS) +- throw runtime_error("Unable to use libpfm to obtain event code; cannot continue"); +- for(i =0 ; i < count; i++) +- { +- ret = pfm_get_event_name(i, name, 256); +- if (ret != PFMLIB_SUCCESS) +- continue; +- if (_get_codes_for_match(i, name, evt_vec)) +- break; +- } +- return (i != count); +-#else +- return _op_get_event_codes(evt_vec); +-#endif +-} +- +-#endif // PPC64_ARCH +- +- + static inline void update_trans_last(struct operf_transient * trans) + { + trans->last = trans->current; +@@ -1465,38 +1310,3 @@ void OP_perf_utils::op_get_kernel_event_data(struct mmap_data *md, operf_record + md->prev = old; + pc->data_tail = old; + } +- +- +-int OP_perf_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry) +-{ +-#define OFFLINE 0x30 +- unsigned int cpu_num; +- char cpu_online_pathname[40]; +- int res; +- FILE * online; +- again: +- do { +- entry = readdir(dir); +- if (!entry) +- return -1; +- } while (entry->d_type != DT_DIR); +- +- res = sscanf(entry->d_name, "cpu%u", &cpu_num); +- if (res <= 0) +- goto again; +- +- errno = 0; +- snprintf(cpu_online_pathname, 40, "/sys/devices/system/cpu/cpu%u/online", cpu_num); +- if ((online = fopen(cpu_online_pathname, "r")) == NULL) { +- cerr << "Unable to open " << cpu_online_pathname << endl; +- if (errno) +- cerr << strerror(errno) << endl; +- return -1; +- } +- res = fgetc(online); +- fclose(online); +- if (res == OFFLINE) +- goto again; +- else +- return cpu_num; +-} +diff --git 
a/libperf_events/operf_utils.h b/libperf_events/operf_utils.h +index 4c191fe..2a979e3 100644 +--- a/libperf_events/operf_utils.h ++++ b/libperf_events/operf_utils.h +@@ -87,8 +87,6 @@ int op_write_output(int output, void *buf, size_t size); + int op_write_event(event_t * event, u64 sample_type); + int op_read_from_stream(std::ifstream & is, char * buf, std::streamsize sz); + int op_mmap_trace_file(struct mmap_info & info, bool init); +-int op_get_next_online_cpu(DIR * dir, struct dirent *entry); +-bool op_convert_event_vals(std::vector * evt_vec); + void op_reprocess_unresolved_events(u64 sample_type, bool print_progress); + void op_release_resources(void); + } +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 5a85c3f..db847ea 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -720,9 +720,9 @@ static void process_args(int argc, char * const argv[]) + + if (ocount_options::evts.empty()) { + // Use default event +- op_pe_utils::op_get_default_event(); ++ op_pe_utils::op_get_default_event(false); + } else { +- op_pe_utils::op_process_events_list(ocount_options::evts); ++ op_pe_utils::op_process_events_list(ocount_options::evts, false, false); + } + cverb << vdebug << "Number of events passed is " << events.size() << endl; + return; +diff --git a/pe_profiling/Makefile.am b/pe_profiling/Makefile.am +index b27cbc7..8c232c4 100644 +--- a/pe_profiling/Makefile.am ++++ b/pe_profiling/Makefile.am +@@ -6,6 +6,7 @@ AM_CPPFLAGS = \ + -I ${top_srcdir}/libop \ + -I ${top_srcdir}/libutil++ \ + -I ${top_srcdir}/libperf_events \ ++ -I ${top_srcdir}/libpe_utils \ + @PERF_EVENT_FLAGS@ \ + @OP_CPPFLAGS@ + +@@ -15,7 +16,8 @@ AM_CXXFLAGS = @OP_CXXFLAGS@ + AM_LDFLAGS = @OP_LDFLAGS@ + + bin_PROGRAMS = operf +-operf_LDADD = ../libperf_events/libperf_events.a \ ++operf_LDADD = ../libperf_events/libperf_events.a \ ++ ../libpe_utils/libpe_utils.a \ + ../libutil++/libutil++.a \ + ../libdb/libodb.a \ + ../libop/libop.a \ +diff --git a/pe_profiling/operf.cpp 
b/pe_profiling/operf.cpp +index 3fec123..89e9c4b 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -35,6 +35,7 @@ + #include + #include + #include "operf_utils.h" ++#include "op_pe_utils.h" + #include "op_libiberty.h" + #include "string_manip.h" + #include "cverb.h" +@@ -50,6 +51,7 @@ + #include "op_netburst.h" + + using namespace std; ++using namespace op_pe_utils; + + typedef enum END_CODE { + ALL_OK = 0, +@@ -73,11 +75,11 @@ uid_t my_uid; + bool no_vmlinux; + int kptr_restrict; + char * start_time_human_readable; ++std::vector events; ++ + + #define DEFAULT_OPERF_OUTFILE "operf.data" +-#define CALLGRAPH_MIN_COUNT_SCALE 15 + +-static char full_pathname[PATH_MAX]; + static char * app_name_SAVE = NULL; + static char ** app_args = NULL; + static pid_t jitconv_pid = -1; +@@ -88,7 +90,6 @@ static string samples_dir; + static bool startApp; + static string outputfile; + static char start_time_str[32]; +-static vector events; + static bool jit_conversion_running; + static void convert_sample_data(void); + static int sample_data_pipe[2]; +@@ -948,517 +949,6 @@ out: + } + + +-static int find_app_file_in_dir(const struct dirent * d) +-{ +- if (!strcmp(d->d_name, app_name)) +- return 1; +- else +- return 0; +-} +- +-static int get_PATH_based_pathname(char * path_holder, size_t n) +-{ +- int retval = -1; +- +- char * real_path = getenv("PATH"); +- char * path = (char *) xstrdup(real_path); +- char * segment = strtok(path, ":"); +- while (segment) { +- struct dirent ** namelist; +- int rc = scandir(segment, &namelist, find_app_file_in_dir, NULL); +- if (rc < 0) { +- if (errno != ENOENT) { +- cerr << strerror(errno) << endl; +- cerr << app_name << " cannot be found in your PATH." 
<< endl; +- break; +- } +- } else if (rc == 1) { +- size_t applen = strlen(app_name); +- size_t dirlen = strlen(segment); +- +- if (applen + dirlen + 2 > n) { +- cerr << "Path segment " << segment +- << " prepended to the passed app name is too long" +- << endl; +- retval = -1; +- break; +- } +- +- if (!strcmp(segment, ".")) { +- if (getcwd(path_holder, PATH_MAX) == NULL) { +- retval = -1; +- cerr << "getcwd [3] failed when processing /" << app_name << " found via PATH. Aborting." +- << endl; +- break; +- } +- } else { +- strncpy(path_holder, segment, dirlen); +- } +- strcat(path_holder, "/"); +- strncat(path_holder, app_name, applen); +- retval = 0; +- free(namelist[0]); +- free(namelist); +- +- break; +- } +- segment = strtok(NULL, ":"); +- } +- free(path); +- return retval; +-} +-int validate_app_name(void) +-{ +- int rc = 0; +- struct stat filestat; +- size_t len = strlen(app_name); +- +- if (len > (size_t) (OP_APPNAME_LEN - 1)) { +- cerr << "app name longer than max allowed (" << OP_APPNAME_LEN +- << " chars)\n"; +- cerr << app_name << endl; +- rc = -1; +- goto out; +- } +- +- if (index(app_name, '/') == app_name) { +- // Full pathname of app was specified, starting with "/". +- strncpy(full_pathname, app_name, len); +- } else if ((app_name[0] == '.') && (app_name[1] == '/')) { +- // Passed app is in current directory; e.g., "./myApp" +- if (getcwd(full_pathname, PATH_MAX) == NULL) { +- rc = -1; +- cerr << "getcwd [1] failed when trying to find app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- strcat(full_pathname, "/"); +- if ((strlen(full_pathname) + strlen(app_name + 2) + 1) > PATH_MAX) { +- rc = -1; +- cerr << "Length of current dir (" << full_pathname << ") and app name (" +- << (app_name + 2) << ") exceeds max allowed (" << PATH_MAX << "). Aborting." 
+- << endl; +- goto out; +- } +- strcat(full_pathname, (app_name + 2)); +- } else if (index(app_name, '/')) { +- // Passed app is in a subdirectory of cur dir; e.g., "test-stuff/myApp" +- if (getcwd(full_pathname, PATH_MAX) == NULL) { +- rc = -1; +- cerr << "getcwd [2] failed when trying to find app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- strcat(full_pathname, "/"); +- strcat(full_pathname, app_name); +- } else { +- // Passed app name, at this point, MUST be found in PATH +- rc = get_PATH_based_pathname(full_pathname, PATH_MAX); +- } +- +- if (rc) { +- cerr << "Problem finding app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- app_name_SAVE = app_name; +- app_name = full_pathname; +- if (stat(app_name, &filestat)) { +- char msg[OP_APPNAME_LEN + 50]; +- snprintf(msg, OP_APPNAME_LEN + 50, "Non-existent app name \"%s\"", +- app_name); +- perror(msg); +- rc = -1; +- } +- +- out: return rc; +-} +- +-static void _get_event_code(operf_event_t * event) +-{ +- FILE * fp; +- char oprof_event_code[9]; +- string command; +- u64 base_code, config; +- char buf[20]; +- if ((snprintf(buf, 20, "%lu", event->count)) < 0) { +- cerr << "Error parsing event count of " << event->count << endl; +- exit(EXIT_FAILURE); +- } +- +- base_code = config = 0ULL; +- +- command = OP_BINDIR; +- command += "ophelp "; +- command += event->name; +- +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(oprof_event_code, sizeof(oprof_event_code), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- +- pclose(fp); +- +- base_code = strtoull(oprof_event_code, (char **) NULL, 10); +- +- +-#if defined(__i386__) || defined(__x86_64__) +- // Setup EventSelct[11:8] field for AMD +- char mask[12]; +- const char * vendor_AMD = "AuthenticAMD"; +- if 
(op_is_cpu_vendor((char *)vendor_AMD)) { +- config = base_code & 0xF00ULL; +- config = config << 32; +- } +- +- // Setup EventSelct[7:0] field +- config |= base_code & 0xFFULL; +- +- // Setup unitmask field +-handle_named_um: +- if (event->um_name[0]) { +- command = OP_BINDIR; +- command += "ophelp "; +- command += "--extra-mask "; +- command += event->name; +- command += ":"; +- command += buf; +- command += ":"; +- command += event->um_name; +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(mask, sizeof(mask), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find unit mask info for " << event->um_name << " for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- // FIXME: The mask value here is the extra bits from the named unit mask. It's not +- // ideal to put that value into the UM's mask, since that's what will show up in +- // opreport. It would be better if we could somehow have the unit mask name that the +- // user passed to us show up in opreort. +- event->evt_um = strtoull(mask, (char **) NULL, 10); +- /* A value >= EXTRA_MIN_VAL returned by 'ophelp --extra-mask' is interpreted as a +- * valid extra value; otherwise we interpret it as a simple unit mask value +- * for a named unit mask with EXTRA_NONE. 
+- */ +- if (event->evt_um >= EXTRA_MIN_VAL) +- config |= event->evt_um; +- else +- config |= ((event->evt_um & 0xFFULL) << 8); +- } else if (!event->evt_um) { +- char * endptr; +- command.clear(); +- command = OP_BINDIR; +- command += "ophelp "; +- command += "--unit-mask "; +- command += event->name; +- command += ":"; +- command += buf; +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get unit mask for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(mask, sizeof(mask), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find unit mask info for event " << event->name << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- event->evt_um = strtoull(mask, &endptr, 10); +- if ((endptr >= mask) && +- (endptr <= (mask + strlen(mask) - 1))) { +- // Must be a default named unit mask +- strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); +- goto handle_named_um; +- } +- config |= ((event->evt_um & 0xFFULL) << 8); +- } else { +- config |= ((event->evt_um & 0xFFULL) << 8); +- } +-#else +- config = base_code; +-#endif +- +- event->op_evt_code = base_code; +- if (cpu_type == CPU_P4 || cpu_type == CPU_P4_HT2) { +- if (op_netburst_get_perf_encoding(event->name, event->evt_um, 1, 1, &config)) { +- cerr << "Unable to get event encoding for " << event->name << endl; +- exit(EXIT_FAILURE); +- } +- } +- event->evt_code = config; +-} +- +-#if PPC64_ARCH +-/* All ppc64 events (except CYCLES) have a _GRP suffix. This is +- * because the legacy opcontrol profiler can only profile events in +- * the same group (i.e., having the same _GRP suffix). But operf +- * can multiplex events, so we should allow the user to pass event +- * names without the _GRP suffix. +- * +- * If event name is not CYCLES or does not have a _GRP suffix, +- * we'll call ophelp and scan the list of events, searching for one +- * that matches up to the _GRP suffix. 
If we don't find a match, +- * then we'll exit with the expected error message for invalid event name. +- */ +-static string _handle_powerpc_event_spec(string event_spec) +-{ +- FILE * fp; +- char line[MAX_INPUT]; +- size_t grp_pos; +- string evt, retval, err_msg; +- size_t evt_name_len; +- bool first_non_cyc_evt_found = false; +- bool event_found = false; +- char event_name[OP_MAX_EVT_NAME_LEN], event_spec_str[OP_MAX_EVT_NAME_LEN + 20], * count_str; +- string cmd = OP_BINDIR; +- cmd += "/ophelp"; +- +- strncpy(event_spec_str, event_spec.c_str(), event_spec.length() + 1); +- +- strncpy(event_name, strtok(event_spec_str, ":"), OP_MAX_EVT_NAME_LEN); +- count_str = strtok(NULL, ":"); +- if (!count_str) { +- err_msg = "Invalid count for event "; +- goto out; +- } +- +- if (!strcmp("CYCLES", event_name)) { +- event_found = true; +- goto out; +- } +- +- evt = event_name; +- // Need to make sure the event name truly has a _GRP suffix. +- grp_pos = evt.rfind("_GRP"); +- if ((grp_pos != string::npos) && ((evt = evt.substr(grp_pos, string::npos))).length() > 4) { +- char * end; +- strtoul(evt.substr(4, string::npos).c_str(), &end, 0); +- if (end && (*end == '\0')) { +- // Valid group number found after _GRP, so we can skip to the end. +- event_found = true; +- goto out; +- } +- } +- +- // If we get here, it implies the user passed a non-CYCLES event without a GRP suffix. +- // Lets try to find a valid suffix for it. +- fp = popen(cmd.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- evt_name_len = strlen(event_name); +- err_msg = "Cannot find event "; +- while (fgets(line, MAX_INPUT, fp)) { +- if (!first_non_cyc_evt_found) { +- if (!strncmp(line, "PM_", 3)) +- first_non_cyc_evt_found = true; +- else +- continue; +- } +- if (line[0] == ' ' || line[0] == '\t') +- continue; +- if (!strncmp(line, event_name, evt_name_len)) { +- // Found a potential match. 
Check if it's a perfect match. +- string save_event_name = event_name; +- size_t full_evt_len = index(line, ':') - line; +- memset(event_name, '\0', OP_MAX_EVT_NAME_LEN); +- strncpy(event_name, line, full_evt_len); +- string candidate = event_name; +- if (candidate.rfind("_GRP") == evt_name_len) { +- event_found = true; +- break; +- } else { +- memset(event_name, '\0', OP_MAX_EVT_NAME_LEN); +- strncpy(event_name, save_event_name.c_str(), evt_name_len); +- } +- } +- } +- pclose(fp); +- +-out: +- if (!event_found) { +- cerr << err_msg << event_name << endl; +- cerr << "Error retrieving info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- retval = event_name; +- return retval + ":" + count_str; +-} +-#endif +- +-static void _process_events_list(void) +-{ +- string cmd = OP_BINDIR; +- if (operf_options::evts.size() > OP_MAX_EVENTS) { +- cerr << "Number of events specified is greater than allowed maximum of " +- << OP_MAX_EVENTS << "." << endl; +- exit(EXIT_FAILURE); +- } +- cmd += "/ophelp --check-events "; +- for (unsigned int i = 0; i < operf_options::evts.size(); i++) { +- FILE * fp; +- string full_cmd = cmd; +- string event_spec = operf_options::evts[i]; +- +-#if PPC64_ARCH +- // Starting with CPU_PPC64_ARCH_V1, ppc64 events files are formatted like +- // other architectures, so no special handling is needed. 
+- if (cpu_type < CPU_PPC64_ARCH_V1) +- event_spec = _handle_powerpc_event_spec(event_spec); +-#endif +- +- if (operf_options::callgraph) { +- full_cmd += " --callgraph=1 "; +- } +- full_cmd += event_spec; +- fp = popen(full_cmd.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- if (fgetc(fp) == EOF) { +- pclose(fp); +- cerr << "Error retrieving info for event " +- << event_spec << endl; +- if (operf_options::callgraph) +- cerr << "Note: When doing callgraph profiling, the sample count must be" +- << endl << "15 times the minimum count value for the event." << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); +- operf_event_t event; +- strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN - 1); +- event.count = atoi(strtok(NULL, ":")); +- /* Name and count are required in the event spec in order for +- * 'ophelp --check-events' to pass. But since unit mask and domain +- * control bits are optional, we need to ensure the result of strtok +- * is valid. +- */ +- char * info; +-#define _OP_UM 1 +-#define _OP_KERNEL 2 +-#define _OP_USER 3 +- int place = _OP_UM; +- char * endptr = NULL; +- event.evt_um = 0ULL; +- event.no_kernel = 0; +- event.no_user = 0; +- event.throttled = false; +- memset(event.um_name, '\0', OP_MAX_UM_NAME_LEN); +- while ((info = strtok(NULL, ":"))) { +- switch (place) { +- case _OP_UM: +- event.evt_um = strtoul(info, &endptr, 0); +- // If any of the UM part is not a number, then we +- // consider the entire part a string. 
+- if (*endptr) { +- event.evt_um = 0; +- strncpy(event.um_name, info, OP_MAX_UM_NAME_LEN - 1); +- } +- break; +- case _OP_KERNEL: +- if (atoi(info) == 0) +- event.no_kernel = 1; +- break; +- case _OP_USER: +- if (atoi(info) == 0) +- event.no_user = 1; +- break; +- } +- place++; +- } +- free(event_str); +- _get_event_code(&event); +- events.push_back(event); +- } +-#if PPC64_ARCH +- { +- /* For ppc64 architecture processors prior to the introduction of +- * architected_events_v1, the oprofile event code needs to be converted +- * to the appropriate event code to pass to the perf_event_open syscall. +- * But as of the introduction of architected_events_v1, the events +- * file contains the necessary event code information, so this conversion +- * step is no longer needed. +- */ +- +- using namespace OP_perf_utils; +- if ((cpu_type < CPU_PPC64_ARCH_V1) && !op_convert_event_vals(&events)) { +- cerr << "Unable to convert all oprofile event values to perf_event values" << endl; +- exit(EXIT_FAILURE); +- } +- } +-#endif +-} +- +-static void get_default_event(void) +-{ +- operf_event_t dft_evt; +- struct op_default_event_descr descr; +- vector tmp_events; +- +- +- op_default_event(cpu_type, &descr); +- if (descr.name[0] == '\0') { +- cerr << "Unable to find default event" << endl; +- exit(EXIT_FAILURE); +- } +- +- memset(&dft_evt, 0, sizeof(dft_evt)); +- if (operf_options::callgraph) { +- struct op_event * _event; +- op_events(cpu_type); +- if ((_event = find_event_by_name(descr.name, 0, 0))) { +- dft_evt.count = _event->min_count * CALLGRAPH_MIN_COUNT_SCALE; +- } else { +- cerr << "Error getting event info for " << descr.name << endl; +- exit(EXIT_FAILURE); +- } +- } else { +- dft_evt.count = descr.count; +- } +- dft_evt.evt_um = descr.um; +- strncpy(dft_evt.name, descr.name, OP_MAX_EVT_NAME_LEN - 1); +- _get_event_code(&dft_evt); +- events.push_back(dft_evt); +- +-#if PPC64_ARCH +- { +- /* This section of code is for architectures such as ppc[64] for which +- * the 
oprofile event code needs to be converted to the appropriate event +- * code to pass to the perf_event_open syscall. +- */ +- +- using namespace OP_perf_utils; +- if ((cpu_type < CPU_PPC64_ARCH_V1) && !op_convert_event_vals(&events)) { +- cerr << "Unable to convert all oprofile event values to perf_event values" << endl; +- exit(EXIT_FAILURE); +- } +- } +-#endif +-} +- + static void _process_session_dir(void) + { + if (operf_options::session_dir.empty()) { +@@ -1752,7 +1242,7 @@ static void process_args(int argc, char * const argv[]) + app_args = (char **) xmalloc((sizeof *app_args) * 2); + app_args[1] = NULL; + } +- if (validate_app_name() < 0) { ++ if (op_validate_app_name(&app_name, &app_name_SAVE) < 0) { + __print_usage_and_exit(NULL); + } + } else { // non_options_idx == 0 +@@ -1783,9 +1273,9 @@ static void process_args(int argc, char * const argv[]) + + if (operf_options::evts.empty()) { + // Use default event +- get_default_event(); ++ op_get_default_event(operf_options::callgraph); + } else { +- _process_events_list(); ++ op_process_events_list(operf_options::evts, true, operf_options::callgraph); + } + op_nr_events = events.size(); + +@@ -1800,87 +1290,6 @@ static void process_args(int argc, char * const argv[]) + return; + } + +-static int _get_cpu_for_perf_events_cap(void) +-{ +- int retval; +- string err_msg; +- char cpus_online[257]; +- FILE * online_cpus; +- DIR *dir = NULL; +- +- int total_cpus = sysconf(_SC_NPROCESSORS_ONLN); +- if (!total_cpus) { +- err_msg = "Internal Error (1): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- +- online_cpus = fopen("/sys/devices/system/cpu/online", "r"); +- if (!online_cpus) { +- err_msg = "Internal Error (2): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- memset(cpus_online, 0, sizeof(cpus_online)); +- +- if ( fgets(cpus_online, sizeof(cpus_online), online_cpus) == NULL) { +- fclose(online_cpus); +- err_msg = "Internal Error (3): Number of 
online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- +- if (!cpus_online[0]) { +- fclose(online_cpus); +- err_msg = "Internal Error (4): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- +- } +- if (index(cpus_online, ',') || cpus_online[0] != '0') { +- // A comma in cpus_online implies a gap, which in turn implies that not all +- // CPUs are online. +- if ((dir = opendir("/sys/devices/system/cpu")) == NULL) { +- fclose(online_cpus); +- err_msg = "Internal Error (5): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } else { +- struct dirent *entry = NULL; +- retval = OP_perf_utils::op_get_next_online_cpu(dir, entry); +- closedir(dir); +- } +- } else { +- // All CPUs are available, so we just arbitrarily choose CPU 0. +- retval = 0; +- } +- fclose(online_cpus); +-error: +- return retval; +-} +- +- +-static int _check_perf_events_cap(bool use_cpu_minus_one) +-{ +- /* If perf_events syscall is not implemented, the syscall below will fail +- * with ENOSYS (38). If implemented, but the processor type on which this +- * program is running is not supported by perf_events, the syscall returns +- * ENOENT (2). +- */ +- struct perf_event_attr attr; +- pid_t pid ; +- int cpu_to_try = use_cpu_minus_one ? 
-1 : _get_cpu_for_perf_events_cap(); +- errno = 0; +- memset(&attr, 0, sizeof(attr)); +- attr.size = sizeof(attr); +- attr.sample_type = PERF_SAMPLE_IP; +- +- pid = getpid(); +- syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0); +- return errno; +- +-} +- + static void _precheck_permissions_to_samplesdir(string sampledir, bool for_current) + { + /* Pre-check to make sure we have permission to remove old sample data +@@ -1911,28 +1320,14 @@ static void _precheck_permissions_to_samplesdir(string sampledir, bool for_curre + + } + +-static int _get_sys_value(const char * filename) +-{ +- char str[10]; +- int _val = -999; +- FILE * fp = fopen(filename, "r"); +- if (fp == NULL) +- return _val; +- if (fgets(str, 9, fp)) +- sscanf(str, "%d", &_val); +- fclose(fp); +- return _val; +-} +- +- + int main(int argc, char * const argv[]) + { + int rc; +- int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid"); ++ int perf_event_paranoid = op_get_sys_value("/proc/sys/kernel/perf_event_paranoid"); + + my_uid = geteuid(); + throttled = false; +- rc = _check_perf_events_cap(use_cpu_minus_one); ++ rc = op_check_perf_events_cap(use_cpu_minus_one); + if (rc == EACCES) { + /* Early perf_events kernels required the cpu argument to perf_event_open + * to be '-1' when setting up to profile a single process if 1) the user is +@@ -1948,7 +1343,7 @@ int main(int argc, char * const argv[]) + */ + if (my_uid != 0 && perf_event_paranoid > 0) { + use_cpu_minus_one = true; +- rc = _check_perf_events_cap(use_cpu_minus_one); ++ rc = op_check_perf_events_cap(use_cpu_minus_one); + } + } + if (rc == EBUSY) { +@@ -1996,7 +1391,7 @@ int main(int argc, char * const argv[]) + _precheck_permissions_to_samplesdir(previous_sampledir, for_current); + } + } +- kptr_restrict = _get_sys_value("/proc/sys/kernel/kptr_restrict"); ++ kptr_restrict = op_get_sys_value("/proc/sys/kernel/kptr_restrict"); + end_code_t run_result; + if ((run_result = _run())) { + if (startApp && 
app_started && (run_result != APP_ABNORMAL_END)) { diff --git a/SOURCES/oprofile-rhbz1385007.patch b/SOURCES/oprofile-rhbz1385007.patch new file mode 100644 index 0000000..1d0477a --- /dev/null +++ b/SOURCES/oprofile-rhbz1385007.patch @@ -0,0 +1,1378 @@ +commit 09bf741f18ae5830156ffbf3e33e933145601b06 +Author: Maynard Johnson +Date: Tue Nov 4 11:06:41 2014 -0600 + + Lower the minimum count value for most marked events for POWER8 + + With the IBM POWER8 processor, marked events occur at a substantially + lower rate than with previous IBM Power processors. This patch adjusts + the minimum count value downwards on the marked events used by POWER8 + in order to try to attain a reasonable sampling rate. + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index a52d9ee..eef5b42 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -30,20 +30,20 @@ event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : + event:0x200fd counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss. + event:0x3e054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1. + event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). +-event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. +-event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. +-event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. +-event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. 
+-event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. +-event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. +-event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : Marked dtlb miss. +-event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : marked instruction completed. +-event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : Marked Instruction dispatched. +-event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : n/a +-event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : Marked L1 Icache Miss. +-event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. +-event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. +-event:0x10134 counters:0 um:zero minimum:1000 name:PM_MRK_ST_CMPL : Marked store completed. ++event:0x301e4 counters:2 um:zero minimum:100 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. ++event:0x101e2 counters:0 um:zero minimum:100 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. ++event:0x401e8 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. ++event:0x201e4 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. ++event:0x201e0 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. ++event:0x301e6 counters:2 um:zero minimum:100 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. ++event:0x401e4 counters:3 um:zero minimum:100 name:PM_MRK_DTLB_MISS : Marked dtlb miss. 
++event:0x401e0 counters:3 um:zero minimum:100 name:PM_MRK_INST_CMPL : marked instruction completed. ++event:0x101e0 counters:0 um:zero minimum:100 name:PM_MRK_INST_DISP : Marked Instruction dispatched. ++event:0x401e6 counters:3 um:zero minimum:100 name:PM_MRK_INST_FROM_L3MISS : n/a ++event:0x101e4 counters:0 um:zero minimum:100 name:PM_MRK_L1_ICACHE_MISS : Marked L1 Icache Miss. ++event:0x101ea counters:0 um:zero minimum:100 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. ++event:0x201e2 counters:1 um:zero minimum:100 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. ++event:0x10134 counters:0 um:zero minimum:100 name:PM_MRK_ST_CMPL : Marked store completed. + event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. + event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. + event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR. +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 6e4e688..cc1163a 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -508,7 +508,7 @@ event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, + event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) + event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO + event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) +-event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough ++event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough) + event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) + event:0xb3908a counters:2 um:zero minimum:10000 
name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high + event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). Prediction too Low +@@ -692,163 +692,163 @@ event:0x1c05e counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_MED : + event:0x2c058 counters:1 um:zero minimum:10000 name:PM_MEM_PREF : Memory prefetch for this lpar. + event:0x10056 counters:0 um:zero minimum:10000 name:PM_MEM_READ : Reads from Memory from this lpar (includes data/inst/xlate/l1prefetch/inst prefetch). + event:0x3c05e counters:2 um:zero minimum:10000 name:PM_MEM_RWITM : Memory rwitm for this lpar. +-event:0x3515e counters:2 um:zero minimum:1000 name:PM_MRK_BACK_BR_CMPL : Marked branch instruction completed with a target address less than current instruction address. +-event:0x2013a counters:1 um:zero minimum:1000 name:PM_MRK_BRU_FIN : bru marked instr finish. +-event:0x1016e counters:0 um:zero minimum:1000 name:PM_MRK_BR_CMPL : Branch Instruction completed. +-event:0x3013a counters:2 um:zero minimum:1000 name:PM_MRK_CRU_FIN : IFU non-branch marked instruction finished. +-event:0x4d148 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. +-event:0x2d128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. +-event:0x3d148 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. 
+-event:0x2c128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. +-event:0x3d14c counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a marked load. +-event:0x2c12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4_CYC : Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load. +-event:0x4d14c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load. +-event:0x2d12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load. +-event:0x1d142 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a marked load. +-event:0x4d146 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load. +-event:0x2d126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load. +-event:0x3d146 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load. +-event:0x2c126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load. 
+-event:0x4c12e counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L2 due to a marked load. +-event:0x4c122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. +-event:0x3d140 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load. +-event:0x2c120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC : Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load. +-event:0x4d140 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load. +-event:0x2d120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC : Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load. +-event:0x2d140 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. +-event:0x4d120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF_CYC : Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. +-event:0x1d140 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a marked load. +-event:0x4c120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. 
+-event:0x4d142 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a marked load. +-event:0x4d144 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. +-event:0x2d124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. +-event:0x3d144 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. +-event:0x2c124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. +-event:0x2d144 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load. +-event:0x4d124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load. +-event:0x1d146 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load. +-event:0x4c126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load. +-event:0x2d12e counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L3 due to a marked load. 
+-event:0x2d122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. +-event:0x3d142 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load. +-event:0x2c122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC : Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load. +-event:0x2d142 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. +-event:0x4d122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF_CYC : Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. +-event:0x1d144 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a marked load. +-event:0x4c124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. +-event:0x1d14c counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. +-event:0x4c12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. +-event:0x2d148 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. +-event:0x4d128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. 
+-event:0x2d14c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. +-event:0x4d12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. +-event:0x4d14a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. +-event:0x2d12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. +-event:0x1d148 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a marked load. +-event:0x4c128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load. +-event:0x2d146 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. +-event:0x4d126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. 
+-event:0x1d14a counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. +-event:0x4c12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. +-event:0x2d14a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load. +-event:0x4d12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4_CYC : Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load. +-event:0x3d14a counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a marked load. +-event:0x2c12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group ( Remote) due to a marked load. +-event:0x40118 counters:3 um:zero minimum:1000 name:PM_MRK_DCACHE_RELOAD_INTV : Combined Intervention event. +-event:0x4d154 counters:3 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16G : Marked Data ERAT Miss (Data TLB Access) page size 16G. +-event:0x3d154 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16M : Marked Data ERAT Miss (Data TLB Access) page size 16M. +-event:0x1d156 counters:0 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_4K : Marked Data ERAT Miss (Data TLB Access) page size 4K. +-event:0x2d154 counters:1 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_64K : Marked Data ERAT Miss (Data TLB Access) page size 64K. 
+-event:0x20132 counters:1 um:zero minimum:1000 name:PM_MRK_DFU_FIN : Decimal Unit marked Instruction Finish. +-event:0x4f148 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. +-event:0x3f148 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. +-event:0x3f14c counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. +-event:0x4f14c counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. +-event:0x1f142 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. +-event:0x4f146 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. +-event:0x3f146 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. +-event:0x1f14e counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a marked data side request. 
+-event:0x3f140 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a marked data side request. +-event:0x4f140 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a marked data side request. +-event:0x2f140 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. +-event:0x1f140 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. +-event:0x4f142 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. +-event:0x4f144 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. +-event:0x3f144 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. +-event:0x2f144 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. +-event:0x1f146 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request. 
+-event:0x4f14e counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a marked data side request. +-event:0x3f142 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. +-event:0x2f142 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked data side request. +-event:0x1f144 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. +-event:0x1f14c counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request. +-event:0x2f148 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. +-event:0x2f14c counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. +-event:0x4f14a counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. +-event:0x1f148 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request. 
+-event:0x2f146 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. +-event:0x1f14a counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. +-event:0x2f14a counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. +-event:0x3f14a counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. +-event:0x1d158 counters:0 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16G : Marked Data TLB Miss page size 16G. +-event:0x4d156 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16M : Marked Data TLB Miss page size 16M. +-event:0x2d156 counters:1 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_4K : Marked Data TLB Miss page size 4k. +-event:0x3d156 counters:2 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_64K : Marked Data TLB Miss page size 64K. +-event:0x40154 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL : Marked store had to do a bkill. +-event:0x2f150 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL_CYC : cycles L2 RC took for a bkill. +-event:0x3015e counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_CLAIM_RTY : Sampled store did a rwitm and got a rty. +-event:0x30154 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM : Marked store had to do a dclaim. +-event:0x2f152 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM_CYC : cycles L2 RC took for a dclaim. 
+-event:0x30156 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH : ttype and cresp matched as specified in MMCR1. +-event:0x4f152 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH_CYC : cresp/ttype match cycles. +-event:0x4015e counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_RTY : Sampled L2 reads retry count. +-event:0x1015e counters:0 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_T_INTV : Sampled Read got a T intervention. +-event:0x4f150 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_CYC : cycles L2 RC took for a rwitm. +-event:0x2015e counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_RTY : Sampled store did a rwitm and got a rty. +-event:0x3012e counters:2 um:zero minimum:1000 name:PM_MRK_FILT_MATCH : Marked filter Match. +-event:0x1013c counters:0 um:zero minimum:1000 name:PM_MRK_FIN_STALL_CYC : Marked instruction Finish Stall cycles (marked finish after NTC) (use edge detect to count #). +-event:0x20134 counters:1 um:zero minimum:1000 name:PM_MRK_FXU_FIN : fxu marked instr finish. ++event:0x3515e counters:2 um:zero minimum:100 name:PM_MRK_BACK_BR_CMPL : Marked branch instruction completed with a target address less than current instruction address. ++event:0x2013a counters:1 um:zero minimum:100 name:PM_MRK_BRU_FIN : bru marked instr finish. ++event:0x1016e counters:0 um:zero minimum:100 name:PM_MRK_BR_CMPL : Branch Instruction completed. ++event:0x3013a counters:2 um:zero minimum:100 name:PM_MRK_CRU_FIN : IFU non-branch marked instruction finished. ++event:0x4d148 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. 
++event:0x2d128 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x3d148 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x2c128 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x3d14c counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x2c12c counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DL4_CYC : Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x4d14c counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load. ++event:0x2d12c counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_DMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load. ++event:0x1d142 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a marked load. ++event:0x4d146 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load. 
++event:0x2d126 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load. ++event:0x3d146 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load. ++event:0x2c126 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L21_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load. ++event:0x4c12e counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L2 due to a marked load. ++event:0x4c122 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. ++event:0x3d140 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load. ++event:0x2c120 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC : Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load. ++event:0x4d140 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d120 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC : Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d140 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. 
++event:0x4d120 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_MEPF_CYC : Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. ++event:0x1d140 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a marked load. ++event:0x4c120 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. ++event:0x4d142 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a marked load. ++event:0x4d144 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d124 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x3d144 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2c124 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_ECO_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d144 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load. ++event:0x4d124 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load. 
++event:0x1d146 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x4c126 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L31_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x2d12e counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L3 due to a marked load. ++event:0x2d122 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. ++event:0x3d142 counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load. ++event:0x2c122 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC : Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load. ++event:0x2d142 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. ++event:0x4d122 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_MEPF_CYC : Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. ++event:0x1d144 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a marked load. ++event:0x4c124 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. 
++event:0x1d14c counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. ++event:0x4c12c counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. ++event:0x2d148 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. ++event:0x4d128 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. ++event:0x2d14c counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d12c counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d14a counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x2d12a counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x1d148 counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a marked load. ++event:0x4c128 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load. 
++event:0x2d146 counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4d126 counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x1d14a counters:0 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4c12a counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x2d14a counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x4d12a counters:3 um:zero minimum:100 name:PM_MRK_DATA_FROM_RL4_CYC : Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x3d14a counters:2 um:zero minimum:100 name:PM_MRK_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x2c12a counters:1 um:zero minimum:100 name:PM_MRK_DATA_FROM_RMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x40118 counters:3 um:zero minimum:100 name:PM_MRK_DCACHE_RELOAD_INTV : Combined Intervention event. ++event:0x4d154 counters:3 um:zero minimum:100 name:PM_MRK_DERAT_MISS_16G : Marked Data ERAT Miss (Data TLB Access) page size 16G. 
++event:0x3d154 counters:2 um:zero minimum:100 name:PM_MRK_DERAT_MISS_16M : Marked Data ERAT Miss (Data TLB Access) page size 16M. ++event:0x1d156 counters:0 um:zero minimum:100 name:PM_MRK_DERAT_MISS_4K : Marked Data ERAT Miss (Data TLB Access) page size 4K. ++event:0x2d154 counters:1 um:zero minimum:100 name:PM_MRK_DERAT_MISS_64K : Marked Data ERAT Miss (Data TLB Access) page size 64K. ++event:0x20132 counters:1 um:zero minimum:100 name:PM_MRK_DFU_FIN : Decimal Unit marked Instruction Finish. ++event:0x4f148 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f148 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f14c counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. ++event:0x4f14c counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. ++event:0x1f142 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. ++event:0x4f146 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. 
++event:0x3f146 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. ++event:0x1f14e counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a marked data side request. ++event:0x3f140 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a marked data side request. ++event:0x4f140 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a marked data side request. ++event:0x2f140 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. ++event:0x1f140 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. ++event:0x4f142 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. ++event:0x4f144 counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. ++event:0x3f144 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. 
++event:0x2f144 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. ++event:0x1f146 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request. ++event:0x4f14e counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a marked data side request. ++event:0x3f142 counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. ++event:0x2f142 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked data side request. ++event:0x1f144 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. ++event:0x1f14c counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request. ++event:0x2f148 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. ++event:0x2f14c counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. 
++event:0x4f14a counters:3 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. ++event:0x1f148 counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request. ++event:0x2f146 counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x1f14a counters:0 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x2f14a counters:1 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. ++event:0x3f14a counters:2 um:zero minimum:100 name:PM_MRK_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. ++event:0x1d158 counters:0 um:zero minimum:100 name:PM_MRK_DTLB_MISS_16G : Marked Data TLB Miss page size 16G. ++event:0x4d156 counters:3 um:zero minimum:100 name:PM_MRK_DTLB_MISS_16M : Marked Data TLB Miss page size 16M. ++event:0x2d156 counters:1 um:zero minimum:100 name:PM_MRK_DTLB_MISS_4K : Marked Data TLB Miss page size 4k. ++event:0x3d156 counters:2 um:zero minimum:100 name:PM_MRK_DTLB_MISS_64K : Marked Data TLB Miss page size 64K. ++event:0x40154 counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_BKILL : Marked store had to do a bkill. 
++event:0x2f150 counters:1 um:zero minimum:100 name:PM_MRK_FAB_RSP_BKILL_CYC : cycles L2 RC took for a bkill. ++event:0x3015e counters:2 um:zero minimum:100 name:PM_MRK_FAB_RSP_CLAIM_RTY : Sampled store did a rwitm and got a rty. ++event:0x30154 counters:2 um:zero minimum:100 name:PM_MRK_FAB_RSP_DCLAIM : Marked store had to do a dclaim. ++event:0x2f152 counters:1 um:zero minimum:100 name:PM_MRK_FAB_RSP_DCLAIM_CYC : cycles L2 RC took for a dclaim. ++event:0x30156 counters:2 um:zero minimum:100 name:PM_MRK_FAB_RSP_MATCH : ttype and cresp matched as specified in MMCR1. ++event:0x4f152 counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_MATCH_CYC : cresp/ttype match cycles. ++event:0x4015e counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_RD_RTY : Sampled L2 reads retry count. ++event:0x1015e counters:0 um:zero minimum:100 name:PM_MRK_FAB_RSP_RD_T_INTV : Sampled Read got a T intervention. ++event:0x4f150 counters:3 um:zero minimum:100 name:PM_MRK_FAB_RSP_RWITM_CYC : cycles L2 RC took for a rwitm. ++event:0x2015e counters:1 um:zero minimum:100 name:PM_MRK_FAB_RSP_RWITM_RTY : Sampled store did a rwitm and got a rty. ++event:0x3012e counters:2 um:zero minimum:100 name:PM_MRK_FILT_MATCH : Marked filter Match. ++event:0x1013c counters:0 um:zero minimum:100 name:PM_MRK_FIN_STALL_CYC : Marked instruction Finish Stall cycles (marked finish after NTC) (use edge detect to count #). ++event:0x20134 counters:1 um:zero minimum:100 name:PM_MRK_FXU_FIN : fxu marked instr finish. + event:0x40130 counters:3 um:zero minimum:1000 name:PM_MRK_GRP_CMPL : marked instruction finished (completed). +-event:0x4013a counters:3 um:zero minimum:1000 name:PM_MRK_GRP_IC_MISS : Marked Group experienced I cache miss. +-event:0x3013c counters:2 um:zero minimum:1000 name:PM_MRK_GRP_NTC : Marked group ntc cycles. +-event:0x20130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? 
++event:0x4013a counters:3 um:zero minimum:100 name:PM_MRK_GRP_IC_MISS : Marked Group experienced I cache miss. ++event:0x3013c counters:2 um:zero minimum:100 name:PM_MRK_GRP_NTC : Marked group ntc cycles. ++event:0x20130 counters:1 um:zero minimum:100 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? + event:0x30130 counters:2 um:zero minimum:1000 name:PM_MRK_INST_FIN : marked instr finish any unit . + event:0x10132 counters:0 um:zero minimum:1000 name:PM_MRK_INST_ISSUED : Marked instruction issued. +-event:0x40134 counters:3 um:zero minimum:1000 name:PM_MRK_INST_TIMEO : marked Instruction finish timeout (instruction lost). +-event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. +-event:0x3012a counters:2 um:zero minimum:1000 name:PM_MRK_L2_RC_DONE : Marked RC done. +-event:0x40116 counters:3 um:zero minimum:1000 name:PM_MRK_LARX_FIN : Larx finished . +-event:0x1013f counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED : Marked Load exposed Miss (use edge detect to count #) +-event:0x1013e counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED_CYC : Marked Load exposed Miss (use edge detect to count #). +-event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. +-event:0x40132 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_FIN : lsu marked instr finish. 
+-event:0xd180 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH : Flush: (marked) : All Cases42 +-event:0xd188 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_LRQ : Flush: (marked) LRQMarked LRQ flushes +-event:0xd18a counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_SRQ : Flush: (marked) SRQMarked SRQ lhs flushes +-event:0xd184 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_ULD : Flush: (marked) Unaligned LoadMarked unaligned load flushes +-event:0xd186 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_UST : Flush: (marked) Unaligned StoreMarked unaligned store flushes +-event:0x40164 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_REJECT : LSU marked reject (up to 2 per cycle). +-event:0x30164 counters:2 um:zero minimum:1000 name:PM_MRK_LSU_REJECT_ERAT_MISS : LSU marked reject due to ERAT (up to 2 per cycle). +-event:0x20112 counters:1 um:zero minimum:1000 name:PM_MRK_NTF_FIN : Marked next to finish instruction finished. +-event:0x1d15e counters:0 um:zero minimum:10000 name:PM_MRK_RUN_CYC : Marked run cycles. +-event:0x1d15a counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_EFF : Marked src pref track was effective. +-event:0x3d15a counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked src. +-event:0x4d15c counters:3 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD : Prefetch tracked was moderate for marked src. +-event:0x1d15c counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L2 : Marked src Prefetch Tracked was moderate (source L2). +-event:0x3d15c counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked src. +-event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). +-event:0x3e158 counters:2 um:zero minimum:1000 name:PM_MRK_STCX_FAIL : marked stcx failed. 
+-event:0x30134 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL_INT : marked store complete (data home) with intervention. +-event:0x3f150 counters:2 um:zero minimum:1000 name:PM_MRK_ST_DRAIN_TO_L2DISP_CYC : cycles to drain st from core to L2. +-event:0x3012c counters:2 um:zero minimum:1000 name:PM_MRK_ST_FWD : Marked st forwards. +-event:0x1f150 counters:0 um:zero minimum:1000 name:PM_MRK_ST_L2DISP_TO_CMPL_CYC : cycles from L2 rc disp to l2 rc completion. +-event:0x20138 counters:1 um:zero minimum:1000 name:PM_MRK_ST_NEST : Marked store sent to nest. +-event:0x1c15a counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_EFF : Marked target pref track was effective. +-event:0x3c15a counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked target. +-event:0x4c15c counters:3 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD : Prefetch tracked was moderate for marked target. +-event:0x1c15c counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L2 : Marked target Prefetch Tracked was moderate (source L2). +-event:0x3c15c counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked target. +-event:0x30132 counters:2 um:zero minimum:1000 name:PM_MRK_VSU_FIN : vsu (fpu) marked instr finish. ++event:0x40134 counters:3 um:zero minimum:100 name:PM_MRK_INST_TIMEO : marked Instruction finish timeout (instruction lost). ++event:0x20114 counters:1 um:zero minimum:100 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. ++event:0x3012a counters:2 um:zero minimum:100 name:PM_MRK_L2_RC_DONE : Marked RC done. ++event:0x40116 counters:3 um:zero minimum:100 name:PM_MRK_LARX_FIN : Larx finished . 
++event:0x1013f counters:0 um:zero minimum:100 name:PM_MRK_LD_MISS_EXPOSED : Marked Load exposed Miss (use edge detect to count #) ++event:0x1013e counters:0 um:zero minimum:100 name:PM_MRK_LD_MISS_EXPOSED_CYC : Marked Load exposed Miss (use edge detect to count #). ++event:0x4013e counters:3 um:zero minimum:100 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. ++event:0x40132 counters:3 um:zero minimum:100 name:PM_MRK_LSU_FIN : lsu marked instr finish. ++event:0xd180 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH : Flush: (marked) : All Cases42 ++event:0xd188 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_LRQ : Flush: (marked) LRQMarked LRQ flushes ++event:0xd18a counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_SRQ : Flush: (marked) SRQMarked SRQ lhs flushes ++event:0xd184 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_ULD : Flush: (marked) Unaligned LoadMarked unaligned load flushes ++event:0xd186 counters:0,1,2,3 um:zero minimum:100 name:PM_MRK_LSU_FLUSH_UST : Flush: (marked) Unaligned StoreMarked unaligned store flushes ++event:0x40164 counters:3 um:zero minimum:100 name:PM_MRK_LSU_REJECT : LSU marked reject (up to 2 per cycle). ++event:0x30164 counters:2 um:zero minimum:100 name:PM_MRK_LSU_REJECT_ERAT_MISS : LSU marked reject due to ERAT (up to 2 per cycle). ++event:0x20112 counters:1 um:zero minimum:100 name:PM_MRK_NTF_FIN : Marked next to finish instruction finished. ++event:0x1d15e counters:0 um:zero minimum:1000 name:PM_MRK_RUN_CYC : Marked run cycles. ++event:0x1d15a counters:0 um:zero minimum:100 name:PM_MRK_SRC_PREF_TRACK_EFF : Marked src pref track was effective. ++event:0x3d15a counters:2 um:zero minimum:100 name:PM_MRK_SRC_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked src. ++event:0x4d15c counters:3 um:zero minimum:100 name:PM_MRK_SRC_PREF_TRACK_MOD : Prefetch tracked was moderate for marked src. 
++event:0x1d15c counters:0 um:zero minimum:100 name:PM_MRK_SRC_PREF_TRACK_MOD_L2 : Marked src Prefetch Tracked was moderate (source L2). ++event:0x3d15c counters:2 um:zero minimum:100 name:PM_MRK_SRC_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked src. ++event:0x3013e counters:2 um:zero minimum:100 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). ++event:0x3e158 counters:2 um:zero minimum:100 name:PM_MRK_STCX_FAIL : marked stcx failed. ++event:0x30134 counters:2 um:zero minimum:100 name:PM_MRK_ST_CMPL_INT : marked store complete (data home) with intervention. ++event:0x3f150 counters:2 um:zero minimum:100 name:PM_MRK_ST_DRAIN_TO_L2DISP_CYC : cycles to drain st from core to L2. ++event:0x3012c counters:2 um:zero minimum:100 name:PM_MRK_ST_FWD : Marked st forwards. ++event:0x1f150 counters:0 um:zero minimum:100 name:PM_MRK_ST_L2DISP_TO_CMPL_CYC : cycles from L2 rc disp to l2 rc completion. ++event:0x20138 counters:1 um:zero minimum:100 name:PM_MRK_ST_NEST : Marked store sent to nest. ++event:0x1c15a counters:0 um:zero minimum:100 name:PM_MRK_TGT_PREF_TRACK_EFF : Marked target pref track was effective. ++event:0x3c15a counters:2 um:zero minimum:100 name:PM_MRK_TGT_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked target. ++event:0x4c15c counters:3 um:zero minimum:100 name:PM_MRK_TGT_PREF_TRACK_MOD : Prefetch tracked was moderate for marked target. ++event:0x1c15c counters:0 um:zero minimum:100 name:PM_MRK_TGT_PREF_TRACK_MOD_L2 : Marked target Prefetch Tracked was moderate (source L2). ++event:0x3c15c counters:2 um:zero minimum:100 name:PM_MRK_TGT_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked target. ++event:0x30132 counters:2 um:zero minimum:100 name:PM_MRK_VSU_FIN : vsu (fpu) marked instr finish. + event:0x3d15e counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked instr. 
+ event:0x20b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NESTED_TEND : Completion time nested tend + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. +commit a7d08172d5738f6e9b3e3ea68e585c1585f5ca21 +Author: Maynard Johnson +Date: Fri Nov 21 15:41:55 2014 -0600 + + Add support for IBM Power event codes longer than sizeof int + + A small number of events on newer IBM Power processors have event codes + that are larger than sizeof(int). Rather than change the width of the + event code everywhere to be a long int (which would include having to + change the sample file format), we have defined some internal-use-only + unit masks for those events. These unit masks are not shown in the ophelp + output, and IBM Power users should never use them in event specifications; + instead, they should use the usual 'null' unit mask value of '0x0' in event + specifications -- e.g., + PM_L1MISS_LAT_EXC_256:0x0:0:1 + + See libpe_utils/op_pe_utils.cpp:_get_event_code for how these unit masks are + used. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index cc1163a..012ca89 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -451,10 +451,10 @@ event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 IS + event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject + event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU + event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread +-event:0x200301ea counters:2 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc +-event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc +-event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc + event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) + event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . 
+ event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 +@@ -879,10 +879,10 @@ event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump predict + event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). + event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 + event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) +-event:0x200301ea counters:2 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 +-event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 +-event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc + event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc + event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc +diff --git a/events/ppc64/power8/unit_masks b/events/ppc64/power8/unit_masks +index 988dd41..203af97 100644 +--- a/events/ppc64/power8/unit_masks ++++ b/events/ppc64/power8/unit_masks +@@ -5,5 +5,13 @@ + # + # ppc64 POWER8 possible unit masks + # ++# NOTE: The 'rc_machine' and 'L1_latency' unit masks are for internal use only, ++# to workaround oprofile's 32-bit limitation for event codes. ++# See libpe_utils/op_pe_utils.cpp:_get_event_code for how these codes are ++# used. + name:zero type:mandatory default:0x0 + 0x0 No unit mask ++name:rc_machine type:mandatory default:0xde ++ 0xde Thresholdable start/stop for rc machine for sampled instruction ++name:L1_latency type:mandatory default:0x67 ++ 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload +diff --git a/libop/op_events.c b/libop/op_events.c +index 8bfd3d2..29dc2f3 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1081,12 +1081,21 @@ static int _is_um_valid_bitmask(struct op_event * event, u32 passed_um) + return retval; + } + +-int op_check_events(int ctr, u32 nr, u32 um, op_cpu cpu_type) ++static int _is_ppc64_cpu_type(op_cpu cpu_type) { ++ char const * cpu_name = op_get_cpu_name(cpu_type); ++ if (strncmp(cpu_name, "ppc64/power", strlen("ppc64/power")) == 0) ++ return 1; ++ else ++ return 0; ++} ++ ++int op_check_events(char * evt_name, int ctr, u32 nr, u32 um, op_cpu cpu_type) + { + int ret = OP_INVALID_EVENT; + size_t i; + u32 ctr_mask = 1 << ctr; + struct list_head * pos; ++ int ibm_power_proc = _is_ppc64_cpu_type(cpu_type); + + load_events(cpu_type); + +@@ -1095,6 +1104,11 @@ int op_check_events(int ctr, u32 nr, u32 um, op_cpu cpu_type) + if (event->val != nr) + continue; + ++ // Why do we have to do this, since 
event codes are supposed to be unique? ++ // See the big comment below. ++ if (ibm_power_proc && strcmp(evt_name, event->name)) ++ continue; ++ + ret = OP_OK_EVENT; + + if ((event->counter_mask & ctr_mask) == 0) +@@ -1108,7 +1122,28 @@ int op_check_events(int ctr, u32 nr, u32 um, op_cpu cpu_type) + if (event->unit->um[i].value == um) + break; + } +- if (i == event->unit->num) ++ /* A small number of events on the IBM Power8 processor have real event ++ * codes that are larger than sizeof(int). Rather than change the width of ++ * the event code everywhere to be a long int (which would include having to ++ * change the sample file format), we have defined some internal-use-only ++ * unit masks for those events. In oprofile's power8 events file, we have ++ * truncated those event codes to integer size, and the truncated bits are ++ * used as a unit mask value which is ORed into the event code by ++ * libpe_utils/op_pe_utils.cpp:_get_event_code(). This technique allowed ++ * us to handle this situation with minimal code perturbation. The one ++ * downside is that the truncated event codes are not unique. So in this ++ * function, where we're searching for events by 'nr' (i.e., the event code), ++ * we have to also make sure the name matches. ++ * ++ * If the user gives us an event specification such as: ++ * PM_L1MISS_LAT_EXC_256:0x0:1:1 ++ * the above code will actually find a non-zero unit mask for this event and ++ * we'd normally fail at this point since the user passed '0x0' for a unit mask. ++ * But we don't expose these internal-use-only UMs to the user, so there's ++ * no way for them to know about it or to try to use it in their event spec; ++ * thus, we handle it below. 
++ */ ++ if ((i == event->unit->num) && !((um == 0) && ibm_power_proc)) + ret |= OP_INVALID_UM; + } + +diff --git a/libop/op_events.h b/libop/op_events.h +index be609f7..ec345e5 100644 +--- a/libop/op_events.h ++++ b/libop/op_events.h +@@ -113,7 +113,7 @@ enum op_event_check { + * + * \sa op_cpu, OP_EVENTS_OK + */ +-int op_check_events(int ctr, u32 event, u32 um, op_cpu cpu_type); ++int op_check_events(char * name, int ctr, u32 event, u32 um, op_cpu cpu_type); + + /** + * free memory used by any call to above function. Need to be called only once +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index 8c69894..c5b6ee7 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -379,7 +379,7 @@ out: + static void _get_event_code(operf_event_t * event, op_cpu cpu_type) + { + FILE * fp; +- char oprof_event_code[9]; ++ char oprof_event_code[11]; + string command; + u64 base_code, config; + char buf[20]; +@@ -412,7 +412,6 @@ static void _get_event_code(operf_event_t * event, op_cpu cpu_type) + + + #if defined(__i386__) || defined(__x86_64__) +- char mask[OP_MAX_UM_NAME_LEN]; + // Setup EventSelct[11:8] field for AMD + const char * vendor_AMD = "AuthenticAMD"; + if (op_is_cpu_vendor((char *)vendor_AMD)) { +@@ -422,8 +421,10 @@ static void _get_event_code(operf_event_t * event, op_cpu cpu_type) + + // Setup EventSelct[7:0] field + config |= base_code & 0xFFULL; +- +- // Setup unitmask field ++#endif ++#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc64__) ++ char mask[OP_MAX_UM_NAME_LEN]; ++// Setup unitmask field + handle_named_um: + if (event->um_name[0]) { + command = OP_BINDIR; +@@ -489,7 +490,12 @@ handle_named_um: + strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN - 1); + goto handle_named_um; + } ++#if defined(__powerpc64__) ++ config = base_code; ++ config |= ((event->evt_um & 0xFFULL) << 32); ++#else + config |= ((event->evt_um & 0xFFULL) << 8); ++#endif + } else { + config |= ((event->evt_um & 0xFFULL) 
<< 8); + } +@@ -505,6 +511,7 @@ handle_named_um: + } + } + event->evt_code = config; ++ cverb << vdebug << "Final event code is " << hex << event->evt_code << endl; + } + + #if PPC64_ARCH +diff --git a/utils/ophelp.c b/utils/ophelp.c +index e38e417..a80fec8 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -180,7 +180,7 @@ static void check_event(struct parsed_event * pev, + if (pev->unit_mask_name) + ret = 0; + else +- ret = op_check_events(0, event->val, pev->unit_mask, cpu_type); ++ ret = op_check_events(pev->name, 0, event->val, pev->unit_mask, cpu_type); + + if (ret & OP_INVALID_UM) { + fprintf(stderr, "Invalid unit mask 0x%x for event %s\n", +commit 34715734fd6f4b44f32206541c8a2500514c9922 +Author: Carl E. Love +Date: Fri Nov 13 12:27:56 2015 -0800 + + Remove Powerpc OProfile events the kernel will reject + + This patch comments out a number of events with "## note 1" to indicate + that the events are not supported. A "note 1" is added to the beginning + of the file explaining the issue. The issue is the events require the + setting of a register by the kernel when setting up to measure the + events that is only writable by the hypervisor. Currently, there is no + API allowing the kernel to request the required field of the register be + changed by the hypervisor. The events can be re-enabled if an API is + created for the OS to request the hypervisor set the bits. + + Signed-off-by: Carl Love + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 012ca89..851299d 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -7,6 +7,20 @@ + + include:ppc64/architected_events_v1 + ++#note 1. 11/12/2015 ++# ++# These events require the cache selector bits to be set to a non-zero ++# value in the processor performance counter setup register. On Power 8, this ++# register is only writable by the hypervisor. 
So the kernel must reject any ++# event where the lower three cache selector bits (bits 22:20) are not equal ++# to 0. If/when an API is implemented to allow the kernel to request the ++# hypervisor write the register with the required value, these events can be ++# re-added to the list of supported events. The issue is documented in the ++# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() ++# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If ++# cache bits are not zero, the function returns -1 to reject the event. ++ ++ + event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. + event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. + event:0x4e05e counters:3 um:zero minimum:100000 name:PM_4LPAR_CYC : Number of cycles in 4 LPAR mode. +@@ -89,49 +103,49 @@ event:0x4d012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR_LONG : + event:0x2d012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VSU : Completion stall due to VSU instruction. + event:0x16083 counters:0 um:zero minimum:10000 name:PM_CO0_ALLOC : 0.0 + event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) +-event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy +-event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) ++## note 1 event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy ++## note 1 event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) + event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. +-event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load +-event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due 
to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final 
Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++## note 1 event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either 
only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 + event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The 
processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c048 
counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c04a counters:2 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load +-event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro +-event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load +-event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load +-event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load +-event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load +-event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or +-event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++## note 1 event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 
event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local 
chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data 
cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++## note 1 event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++## note 1 event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++## note 1 event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load ++## note 1 event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++## note 1 event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++## note 1 event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++## note 1 event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load + event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load. + event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. + event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
+@@ -430,11 +444,11 @@ event:0x25046 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_MOD : A + event:0x1504a counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. + event:0x2504a counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a instruction side request. + event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a instruction side request. +-event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts +-event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine +-event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision ++## note 1 event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts ++## note 1 event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine ++## note 1 event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision + event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) +-event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch ++## note 1 event:0x44608e counters:3 um:zero minimum:10000 
name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch + event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. + event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject + event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject +@@ -451,107 +465,107 @@ event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 IS + event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject + event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU + event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread +-event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc +-event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc +-event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++## note 1 event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++## note 1 event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++## note 1 event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++## note 1 event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc + event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) + event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . 
+ event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 + event:0x40012 counters:3 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_ALL : Counts all Icache reloads includes demand, prefetchm prefetch turned into demand and demand turned into prefetch. + event:0x30068 counters:2 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_PREF : Counts all Icache prefetch reloads ( includes demand turned into prefetch). +-event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) +-event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) ++## note 1 event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) ++## note 1 event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) + event:0x27084 counters:1 um:zero minimum:10000 name:PM_L2_CHIP_PUMP : RC requests that were local on chip pump attempts +-event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 +-event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. 
+-event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts +-event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) +-event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) +-event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 +-event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) +-event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) +-event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread +-event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful load dispatches +-event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits +-event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread +-event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc and guess was correct (ie data local) +-event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) +-event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt +-event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ +-event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons 
+-event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt +-event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ +-event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons +-event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx +-event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core ++## note 1 event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 ++## note 1 event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. ++## note 1 event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts ++## note 1 event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) ++## note 1 event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) ++## note 1 event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 ++## note 1 event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++## note 1 event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) ++## note 1 event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread ++## note 1 event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All 
successful load dispatches ++## note 1 event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits ++## note 1 event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread ++## note 1 event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc and guess was correct (ie data local) ++## note 1 event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) ++## note 1 event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt ++## note 1 event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ ++## note 1 event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons ++## note 1 event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt ++## note 1 event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ ++## note 1 event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons ++## note 1 event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx ++## note 1 event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core + event:0x3708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core +-event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M +-event:0x54708e counters:3 um:zero minimum:10000 
name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M +-event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix ++## note 1 event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M ++## note 1 event:0x54708e counters:3 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M ++## note 1 event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix + event:0x17080 counters:0 um:zero minimum:10000 name:PM_L2_ST : All successful D-side store dispatches for this thread +-event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches +-event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits ++## note 1 event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches ++## note 1 event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits + event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successful D-side store dispatches for this thread that were L2 Miss +-event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) +-event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) ++## note 1 event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) ++## note 1 event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) + event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts + event:0x1e05e 
counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. + event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. +-event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject +-event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count +-event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++## note1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++## note1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++## note1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count + event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives +-event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) ++## note 1 event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) + event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 + event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid + event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) +-event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++## note1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO + event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) + event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough) +-event:0xb19082 counters:0 um:zero minimum:10000 
name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) +-event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high +-event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). Prediction too Low +-event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits +-event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits +-event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss +-event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit +-event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss +-event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits +-event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss ++## note 1 event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) ++## note 1 event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high ++## note 1 event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). 
Prediction too Low ++## note 1 event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits ++## note 1 event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits ++## note 1 event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss ++## note 1 event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit ++## note 1 event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss ++## note 1 event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits ++## note 1 event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss + event:0x1e052 counters:0 um:zero minimum:10000 name:PM_L3_LD_PREF : L3 Load Prefetches. +-event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) +-event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low +-event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses +-event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 +-event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 +-event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 +-event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 +-event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 +-event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 +-event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 +-event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 +-event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 +-event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 +-event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx +-event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 +-event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 +-event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 +-event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 +-event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 +-event:0xa29086 counters:1 um:zero minimum:10000 name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 +-event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 +-event:0x518082 counters:0 um:zero 
minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 +-event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 +-event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 +-event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 +-event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 +-event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx +-event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 +-event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 ++## note 1 event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) ++## note 1 event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low ++## note 1 event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses ++## note 1 event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 ++## note 1 event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 ++## note 1 event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 ++## note 1 event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 ++## note 1 event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 ++## note 1 event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 ++## note 1 event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 ++## note 1 event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 ++## note 1 event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 ++## note 1 event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 ++## note 1 event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx ++## note 1 event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 ++## note 1 event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 ++## note 1 event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 ++## note 1 event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 ++## note 1 event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 ++## note 1 event:0xa29086 counters:1 um:zero minimum:10000 
name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 ++## note 1 event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 ++## note 1 event:0x518082 counters:0 um:zero minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 ++## note 1 event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 ++## note 1 event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 ++## note 1 event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 ++## note 1 event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 ++## note 1 event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx ++## note 1 event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 ++## note 1 event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 + event:0x84908d counters:3 um:zero minimum:10000 name:PM_L3_PF0_ALLOC : 0.0 + event:0x84908c counters:3 um:zero minimum:10000 name:PM_L3_PF0_BUSY : lifetime, sample of PF machine 0 valid +-event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 ++## note 1 event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 + event:0x18080 counters:0 um:zero minimum:10000 name:PM_L3_PF_MISS_L3 : L3 Prefetch missed in L3 + event:0x3808a counters:2 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_CACHE : L3 Prefetch from Off chip cache + event:0x4808e counters:3 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_MEM : L3 Prefetch from Off chip memory +@@ -567,12 +581,12 @@ event:0x839088 counters:2 um:zero minimum:10000 name:PM_L3_SN0_BUSY : lifetime, + event:0x819080 counters:0 um:zero minimum:10000 name:PM_L3_SN_USAGE : rotating sample of 8 
snoop valids + event:0x2e052 counters:1 um:zero minimum:10000 name:PM_L3_ST_PREF : L3 store Prefetches. + event:0x3e052 counters:2 um:zero minimum:10000 name:PM_L3_SW_PREF : Data stream touchto L3. +-event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) +-event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Predction too high +-event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch ++## note 1 event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) ++## note 1 event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Predction too high ++## note 1 event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch + event:0x18081 counters:0 um:zero minimum:10000 name:PM_L3_WI0_ALLOC : 0.0 +-event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid +-event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives ++## note 1 event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid ++## note 1 event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives + event:0x3c058 counters:2 um:zero minimum:10000 name:PM_LARX_FIN : Larx finished . + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. 
+@@ -853,7 +867,7 @@ event:0x3d15e counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked in + event:0x20b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NESTED_TEND : Completion time nested tend + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. + event:0x20b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_NON_FAV_TBEGIN : Dispatch time non favored tbegin +-event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc ++## note 1 event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc + event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Ccycles after all instructions have finished to group completed. + event:0x20ac counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TBEGIN : Completion time outer tbegin + event:0x20ae counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TEND : Completion time outer tend +@@ -879,14 +893,14 @@ event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump predict + event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). + event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 + event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) +-event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 +-event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 +-event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++## note 1 event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++## note 1 event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++## note 1 event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++## note 1 event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc + event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running +-event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc +-event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc +-event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine ++## note 1 event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc ++## note 1 event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc ++## note 1 event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine + event:0x20004 counters:1 um:zero minimum:10000 name:PM_REAL_SRQ_FULL : Out of real srq entries. + event:0x3006c counters:2 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_MODE : Cycles run latch is set and core is in SMT2 mode. + event:0x2006a counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_SHRD_MODE : Cycles run latch is set and core is in SMT2-shared mode. +@@ -902,13 +916,13 @@ event:0x5090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DISABLE : Sto + event:0x26085 counters:1 um:zero minimum:10000 name:PM_SN0_ALLOC : 0.0 + event:0x26084 counters:1 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) + event:0xd0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_SNOOP_TLBIE : TLBIE snoopSnoop TLBIE +-event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu +-event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te ++## note 1 event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu ++## note 1 event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te + event:0x4608c counters:3 um:zero minimum:10000 name:PM_SN_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 SN machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x10028 counters:0 um:zero minimum:10000 name:PM_STALL_END_GCT_EMPTY : Count ended because GCT went empty. + event:0x1e058 counters:0 um:zero minimum:10000 name:PM_STCX_FAIL : stcx failed . + event:0xc090 counters:0,1,2,3 um:zero minimum:10000 name:PM_STCX_LSU : STCX executed reported at sent to nest42 +-event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail ++## note 1 event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail + event:0x20016 counters:1 um:zero minimum:10000 name:PM_ST_CMPL : Store completion count. + event:0x20018 counters:1 um:zero minimum:10000 name:PM_ST_FWD : Store forwards that finished. + event:0x0 counters:0,1,2,3 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF. +@@ -941,8 +955,8 @@ event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshol + event:0x30058 counters:2 um:zero minimum:10000 name:PM_TLBIE_FIN : tlbie finished. + event:0x20066 counters:1 um:zero minimum:10000 name:PM_TLB_MISS : TLB Miss (I + D). 
+ event:0x20b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_BEGIN_ALL : Tm any tbegin +-event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC +-event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capactiy Overflow ++## note 1 event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC ++## note 1 event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capactiy Overflow + event:0x20ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_END_ALL : Tm any tend + event:0x3086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_NON_TM : TEXAS fail reason @ completion + event:0x3088 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CON_TM : TEXAS fail reason @ completion +@@ -952,13 +966,13 @@ event:0xe0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_NON_TX_CONFL + event:0x308a counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_SELF : TEXAS fail reason @ completion + event:0xe0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TLBIE : TLBIE hit bloom filter42 + event:0xe0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TX_CONFLICT : Transactional conflict from LSU, whatever gets reported to texas 42 +-event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail +-event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail +-event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) +-event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc +-event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line +-event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to 
fail +-event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) ++## note 1 event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail ++## note 1 event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail ++## note 1 event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) ++## note 1 event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc ++## note 1 event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line ++## note 1 event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to fail ++## note 1 event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) + event:0x20bc counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TBEGIN : Tm nested tbegin + event:0x10060 counters:0 um:zero minimum:10000 name:PM_TM_TRANS_RUN_CYC : run cycles in transactional state. + event:0x30060 counters:2 um:zero minimum:10000 name:PM_TM_TRANS_RUN_INST : Instructions completed in transactional state. +commit 54bd5569033f7ec395e47efc5264d95e48907475 +Author: William Cohen +Date: Thu Nov 19 16:29:22 2015 -0500 + + Remove unused Power 8 unit masks + + To prevent people from using PMU events that the kernel would reject + on Power 8 commit 34715734fd6f commented out those events. However, + additional checks in oprofile code would note that some of the unit + masks were unused due to those commented out events and prevent + oprofile tools from running. The unused unit masks have been + commented out to pass these checks. 
+ + Signed-off-by: William Cohen + +diff --git a/events/ppc64/power8/unit_masks b/events/ppc64/power8/unit_masks +index 203af97..96b32c0 100644 +--- a/events/ppc64/power8/unit_masks ++++ b/events/ppc64/power8/unit_masks +@@ -9,9 +9,26 @@ + # to workaround oprofile's 32-bit limitation for event codes. + # See libpe_utils/op_pe_utils.cpp:_get_event_code for how these codes are + # used. ++# ++#note 1. 11/12/2015 ++# ++# Some event requires the cache selector bits to be set to a non-zero ++# value in the processor performance counter setup register. On Power 8, this ++# register is only writable by the hypervisor. So the kernel must reject any ++# event where the lower three cache selector bits (bits 22:20) are not equal ++# to 0. If/when an API is implemented to allow the kernel to request the ++# hypervisor write the register with the required value, these events can be ++# re-added to the list of supported events. The issue is documented in the ++# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() ++# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If ++# cache bits are not zero, the function returns -1 to reject the event. ++# ++# The associated unit masks for these problem events are unused and also need ++# to be commented out. ++# + name:zero type:mandatory default:0x0 + 0x0 No unit mask +-name:rc_machine type:mandatory default:0xde +- 0xde Thresholdable start/stop for rc machine for sampled instruction +-name:L1_latency type:mandatory default:0x67 +- 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload ++## note 1 name:rc_machine type:mandatory default:0xde ++## note 1 0xde Thresholdable start/stop for rc machine for sampled instruction ++## note 1 name:L1_latency type:mandatory default:0x67 ++## note 1 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload +commit cfecfbfa3e5c76ab544f64946af38a7f2efec9a3 +Author: Carl E. 
Love +Date: Mon Dec 14 14:18:35 2015 -0800 + + Remove Powerpc OProfile events the kernel will reject + + Will, Rei: + + As I suspected in the last email, there is a second issue that has to do + with what version of the libpfm library OProfile is using. Initially a + subset of the OProfile events for Power 8 were added to libpfm4.5. + Later the complete set of events was added to libpfm4.6. So, my first + attempt at removing the events that perf was rejecting inadvertently + included events that were in libpfm 4.6 but not libpfm 4.5. My version + of oprofile used a patched version of libpfm 4.4 that effectively + made it libpfm 4.5. I redid the patch and verified that when Oprofile is + built with libpfm 4.6 there are no rejected events. I added a comment + in the event file as a heads up to this effect. + + Please take a look at the patch and see if it works OK for you. Sorry + for the delay in getting this out. + + Carl Love + + ---------------------------- + + Re-enable Power 8 events that the kernel does not reject. + + The previous patch to remove Power 8 events that were being rejected by + the kernel also removed events that were actually being rejected by + OProfile. OProfile was rejecting the events on the test machine because + the test machine used a version of libpfm that did not have all of the + Power 8 events. This patch re-enables the Power 8 events that are + not rejected by the kernel. + + Libpfm 4.5 only contains a subset of all the available Power 8 events. + The complete list of Power 8 events is supported by libpfm 4.6. To use + all of the events in this file, OProfile must be compiled with + the libpfm 4.6 library or newer. Otherwise, OProfile will reject the + event if it is not in the libpfm 4.5 library. 
+ + Signed-off-by: Carl Love + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 851299d..9a3c74e 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -1,4 +1,4 @@ +-# ++ + # Copyright OProfile authors + # Copyright (c) International Business Machines, 2013. + # Contributed by Maynard Johnson . +@@ -7,7 +7,7 @@ + + include:ppc64/architected_events_v1 + +-#note 1. 11/12/2015 ++# note 1. 11/12/2015 + # + # These event requires the cache selector bits to be set to a non-zero + # value in the processor performance counter setup register. On Power 8, this +@@ -19,7 +19,12 @@ include:ppc64/architected_events_v1 + # powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() + # where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If + # cache bits are not zero, the function returns -1 to reject the event. +- ++# ++# note 2. ++# ++# To use all of the events listed in this file, you must have OProfile ++# complied with the libpfm 4.6 or newer library. Libpfm 4.5 supports a ++# subset of these events. + + event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. + event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. +@@ -107,45 +112,45 @@ event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy + ## note 1 event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) + event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. 
+-## note 1 event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load +-## note 1 event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : 
The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group 
(Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 + event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data 
cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or 
demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c048 counters:0 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load +-## note 1 event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro +-## note 1 event:0x61c052 counters:0 um:zero 
minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load +-## note 1 event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load +-## note 1 event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load +-## note 1 event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load +-## note 1 event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or +-## note 1 event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or 
demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was 
reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( 
Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load ++event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load + event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load. + event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. + event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
+@@ -465,10 +470,10 @@ event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 IS + event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject + event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU + event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread +-## note1 event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc +-## note1 event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc +-## note1 event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc +-## note1 event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc + event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) + event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . 
+ event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 +@@ -512,15 +517,15 @@ event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successf + event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts + event:0x1e05e counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. + event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. +-## note1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject +-## note1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count +-## note1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++## note 1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++## note 1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++## note 1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count + event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives + ## note 1 event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) + event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 + event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid + event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) +-## note1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++## note 1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO + event:0x28084 counters:1 um:zero 
minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) + event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough) + ## note 1 event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) +@@ -893,10 +898,10 @@ event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump predict + event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). + event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 + event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) +-## note 1 event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc +-## note 1 event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 +-## note 1 event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 +-## note 1 event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc + event:0x36088 counters:2 um:zero minimum:10000 
name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + ## note 1 event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc + ## note 1 event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc +diff --git a/events/ppc64/power8/unit_masks b/events/ppc64/power8/unit_masks +index 96b32c0..391f363 100644 +--- a/events/ppc64/power8/unit_masks ++++ b/events/ppc64/power8/unit_masks +@@ -10,25 +10,9 @@ + # See libpe_utils/op_pe_utils.cpp:_get_event_code for how these codes are + # used. + # +-#note 1. 11/12/2015 +-# +-# Some event requires the cache selector bits to be set to a non-zero +-# value in the processor performance counter setup register. On Power 8, this +-# register is only writable by the hypervisor. So the kernel must reject any +-# event where the lower three cache selector bits (bits 22:20) are not equal +-# to 0. If/when an API is implemented to allow the kernel to request the +-# hypervisor write the register with the required value, these events can be +-# re-added to the list of supported events. The issue is documented in the +-# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() +-# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If +-# cache bits are not zero, the function returns -1 to reject the event. +-# +-# The associated unit masks for these problem events are unused and also need +-# to be commented out. 
+-# + name:zero type:mandatory default:0x0 + 0x0 No unit mask +-## note 1 name:rc_machine type:mandatory default:0xde +-## note 1 0xde Thresholdable start/stop for rc machine for sampled instruction +-## note 1 name:L1_latency type:mandatory default:0x67 +-## note 1 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload ++name:rc_machine type:mandatory default:0xde ++ 0xde Thresholdable start/stop for rc machine for sampled instruction ++name:L1_latency type:mandatory default:0x67 ++ 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload +commit 6fcd5aa57482a58fcb0166982fed517fbf7040fb +Author: Carl E. Love +Date: Thu Mar 17 13:49:41 2016 -0700 + + POWER 8 processor event spelling fixes + + Will: + + Here is a patch to fix the spelling errors in the Power 8 events. See + OProfile bugzilla number 281. This patch corrects the spelling errors. + + Carl Love + ------------------------------------------------------------ + + POWER 8 processor event spelling fixes. + + Fixed the spelling of six of the events. + + Signed-off-by: Carl E. Love + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 9a3c74e..b7f7ee2 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -455,13 +455,13 @@ event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page + event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) + ## note 1 event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch + event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. 
+-event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject +-event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject ++event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_FX0 : FX0 ISU reject ++event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_FX1 : FX1 ISU reject + event:0x38ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FXU : ISU +-event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS0 : LS0 ISU reject +-event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS1 : LS1 ISU reject +-event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS2 : LS2 ISU reject +-event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS3 : LS3 ISU reject ++event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS0 : LS0 ISU reject ++event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS1 : LS1 ISU reject ++event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS2 : LS2 ISU reject ++event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS3 : LS3 ISU reject + event:0x309c counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECTS_ALL : All isu rejects could be more than 1 per cycle + event:0x30a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_RES_NA : ISU reject due to resource not available + event:0x309e counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SAR_BYPASS : Reject because of SAR bypass diff --git a/SOURCES/oprofile-rhbz1426426.patch b/SOURCES/oprofile-rhbz1426426.patch new file mode 100644 index 0000000..2ee35a1 --- /dev/null +++ b/SOURCES/oprofile-rhbz1426426.patch @@ -0,0 +1,30 @@ +commit 6b4aaf9a6c810be7c696b9edb9ad232a02a83e0b +Author: Carl E. Love +Date: Mon Dec 19 09:31:13 2016 -0800 + + Add support for the IBM POWER 8 NV and NVL variants. + + Add support for the IBM POWER 8 NV and NVL variants. 
+ + The processor performance counter unit is identical for the various + variants of the POWER 8 processor. The variants vary some for the + non-cpu events. + + Signed-off-by: Carl Love + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index d03f9d6..0e1f6e4 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -330,6 +330,11 @@ static op_cpu _get_ppc64_cpu_type(void) + if (strncmp(cpu_name_lowercase, "power8e", 7) == 0) + cpu_name_lowercase[6] = '\0'; + ++ /* The POWER8NV and POWER8NVL variants have the same core PMU events as ++ * POWER8. */ ++ if (strncmp(cpu_name_lowercase, "power8nv", 8) == 0) ++ cpu_name_lowercase[6] = '\0'; ++ + cpu_type_str[0] = '\0'; + strcat(cpu_type_str, "ppc64/"); + strncat(cpu_type_str, cpu_name_lowercase, len); diff --git a/SOURCES/oprofile-silvermont.patch b/SOURCES/oprofile-silvermont.patch new file mode 100644 index 0000000..3ddd04e --- /dev/null +++ b/SOURCES/oprofile-silvermont.patch @@ -0,0 +1,434 @@ +commit 4b1497d8befcc4c8b26dc4e4866c3422ae8787c3 +Author: Andi Kleen +Date: Thu Oct 10 13:12:28 2013 -0500 + + Add support for Intel Silvermont processor + + Just add the event list for Intel Silvermont based systems + (Avoton, BayTrail) and the usual changes for a new CPU. + No new code otherwise. + + The model number list is incomplete at this point, more will + be added in the future. + + I also finally removed the top level event list descriptions. + All the events are only described in the unit masks now + (Intel doesn't really have a top level event, and I had + to invent descriptions, which was error prone and + often wrong) + + I also removed some outdated document number references. 
+ + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index d91d44b..3028c2f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -21,6 +21,7 @@ event_files = \ + i386/sandybridge/events i386/sandybridge/unit_masks \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ ++ i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ + ia64/itanium/events ia64/itanium/unit_masks \ +diff --git a/events/i386/silvermont/events b/events/i386/silvermont/events +new file mode 100644 +index 0000000..077cc0a +--- /dev/null ++++ b/events/i386/silvermont/events +@@ -0,0 +1,26 @@ ++# ++# Intel "Silvermont" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Silvermont based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++include:i386/arch_perfmon ++event:0x32 counters:0,1 um:l2_prefetcher_throttle minimum:200003 name:l2_prefetcher_throttle : ++event:0x3e counters:0,1 um:one minimum:200003 name:l2_prefetcher_pref_stream_alloc : ++event:0x50 counters:0,1 um:zero minimum:200003 name:l2_prefetch_pend_streams_pref_stream_pend_set : ++event:0x86 counters:0,1 um:nip_stall minimum:200003 name:nip_stall : ++event:0x87 counters:0,1 um:decode_stall minimum:200003 name:decode_stall : ++event:0x96 counters:0,1 um:uip_match minimum:200003 name:uip_match : ++event:0xc2 counters:0,1 um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:0,1 um:x10 minimum:200003 name:machine_clears_live_lock_breaker : ++event:0xc4 counters:0,1 um:br_inst_retired minimum:2000003 name:br_inst_retired : ++event:0xc5 counters:0,1 um:br_misp_retired minimum:200003 name:br_misp_retired : ++event:0xca counters:0,1 um:no_alloc_cycles minimum:200003 name:no_alloc_cycles : ++event:0xcb counters:0,1 um:rs_full_stall minimum:200003 name:rs_full_stall : ++event:0xcc counters:0,1 um:rs_dispatch_stall minimum:200003 name:rs_dispatch_stall : ++event:0xe6 counters:0,1 um:baclears minimum:2000003 name:baclears : ++event:0xe7 counters:0,1 um:x02 minimum:200003 name:ms_decoded_early_exit : ++event:0xe8 counters:0,1 um:one minimum:200003 name:btclears_all : ++event:0xe9 counters:0,1 um:decode_restriction minimum:200003 name:decode_restriction : +diff --git a/events/i386/silvermont/unit_masks b/events/i386/silvermont/unit_masks +new file mode 100644 +index 0000000..6309282 +--- /dev/null ++++ b/events/i386/silvermont/unit_masks +@@ -0,0 +1,71 @@ ++# ++# Unit masks for the Intel "Silvermont" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Silvermont based CPUs ++# ++include:i386/arch_perfmon ++name:x02 type:mandatory default:0x2 ++ 0x2 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask ++name:l2_prefetcher_throttle type:exclusive default:0x2 ++ 0x2 extra:edge 
conservative Counts the number of cycles the L2 prefetcher spends in throttling mode ++ 0x1 extra:edge aggressive Counts the number of cycles the L2 prefetcher spends in throttling mode ++name:nip_stall type:exclusive default:0x3f ++ 0x3f extra: all Counts the number of cycles the NIP stalls. ++ 0x1 extra: pfb_full Counts the number of cycles the NIP stalls and the PFBs are full. This DOES NOT inlude PFB throttler cases. ++ 0x2 extra: itlb_miss Counts the number of cycles the NIP stalls and there is an outstanding ITLB miss. This is a cummulative count of cycles the NIP stalled for all ITLB misses. ++ 0x8 extra: pfb_throttler Counts the number of cycles the NIP stalls, the throttler is engaged, and the PFBs appear full. ++ 0x10 extra: do_snoop Counts the number of cycles the NIP stalls because of a SMC compliance snoop to the MEC is required. ++ 0x20 extra: misc_other Counts the number of cycles the NIP stalls due to NUKE, Stop Front End, Inserted flows. ++ 0x1e extra: pfb_ready Counts the number of cycles the NIP stalls when the PFBs are not full and the decoders are able to process bytes. Does not count PFB_FULL nor MISC_OTHER stall cycles. ++name:decode_stall type:exclusive default:0x1 ++ 0x1 extra: pfb_empty Counts the number of cycles decoder is stalled because the PFB is empty, this count is useful to see if the decoder is receiving the bytes from the front end. This event together with the DECODE_STALL.IQ_FULL may be used to narrow down on the bottleneck. ++ 0x2 extra: iq_full Counts the number of cycles decoder is stalled because the IQ is full, this count is useful to see if the decoder is delivering the decoded uops. This event together with the DECODE_STALL.PFB_EMPTY may be used to narrow down on the bottleneck. 
++name:uip_match type:exclusive default:0x1 ++ 0x1 extra: first_uip This event is used for counting the number of times a specific micro IP address was decoded ++ 0x2 extra: second_uip This event is used for counting the number of times a specific micro IP address was decoded ++name:uops_retired type:exclusive default:0x2 ++ 0x2 extra: x87 This event counts the number of micro-ops retired that used X87 hardware. ++ 0x4 extra: mul This event counts the number of micro-ops retired that used MUL hardware. ++ 0x8 extra: div This event counts the number of micro-ops retired that used DIV hardware. ++ 0x1 extra: ms_cyles Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to faults, assists, and inserted flows. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: remove_jcc REMOVE_JCC counts the number of branch instructions retired but removes taken and not taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x2 extra: remove_rel_call REMOVE_REL_CALL counts the number of branch instructions retired but removes near relative CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x4 extra: remove_ind_call REMOVE_IND_CALL counts the number of branch instructions retired but removes near indirect CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x8 extra: remove_ret REMOVE_RET counts the number of branch instructions retired but removes near RET. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of branch instructions retired but removes near indirect JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x20 extra: remove_rel_jmp REMOVE_REL_JMP counts the number of branch instructions retired but removes near relative JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x40 extra: remove_far REMOVE_FAR counts the number of branch instructions retired but removes all far branches. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of branch instructions retired but removes taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: remove_jcc REMOVE_JCC counts the number of mispredicted branch instructions retired but removes taken and not taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x4 extra: remove_ind_call REMOVE_IND_CALL Counts the number of mispredicted branch instructions retired but removes near indirect CALL. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x8 extra: remove_ret REMOVE_RET Counts the number of mispredicted branch instructions retired but removes near RET. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of mispredicted branch instructions retired but removes near indirect JMP. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of mispredicted branch instructions retired but removes taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++name:no_alloc_cycles type:exclusive default:0x3f ++ 0x3f extra:inv all Counts the number of cycles that uops are allocated (inverse of NO_ALLOC_CYCLES.ALL) ++ 0x2 extra: sd_buffer_full Counts the number of cycles when no uops are allocated and the store data buffer is full. ++ 0x4 extra: mispredicts Counts the number of cycles when no uops are allocated and the alloc pipe is stalled waiting for a mispredicted jump to retire. 
After the misprediction is detected, the front end will start immediately but the allocate pipe stalls until the mispredicted ++ 0x8 extra: scoreboard Counts the number of cycles when no uops are allocated and a microcode IQ-based scoreboard stall is active. This includes stalls due to both the retirement scoreboard (at-ret) and micro-Jcc execution scoreboard (at-jeu). Does not count cycles when the MS ++ 0x10 extra: iq_empty Counts the number of cycles when no uops are allocated and the IQ is empty. Will assert immediately after a mispredict and partially overlap with MISPREDICTS sub event. ++name:rs_full_stall type:exclusive default:0x2 ++ 0x2 extra: iec_port0 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 0 is full. ++ 0x4 extra: iec_port1 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 1 is full. ++ 0x8 extra: fpc_port0 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 0 is full. ++ 0x10 extra: fpc_port1 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 1 is full. 
++name:rs_dispatch_stall type:exclusive default:0x1 ++ 0x1 extra: iec0_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 0 of IEC RS while the RS had valid ops left to dispatch ++ 0x2 extra: iec1_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 1 of IEC RS while the RS had valid ops left to dispatch ++ 0x4 extra: fpc0_rs Counts cycles when no uops were disptached from port 0 of FPC RS while the RS had valid ops left to dispatch ++ 0x8 extra: fpc1_rs Counts cycles when no uops were disptached from port 1 of FPC RS while the RS had valid ops left to dispatch ++ 0x10 extra: mec_rs Counts cycles when no uops were dispatched from the MEC RS or rehab queue while valid ops were left to dispatch ++name:baclears type:exclusive default:0x2 ++ 0x2 extra: indirect Counts the number indirect branch baclears ++ 0x4 extra: uncond Counts the number unconditional branch baclears ++ 0x1e extra: no_corner_case sum of submasks [4:1]. Does not count special case baclears due to things like parity errors, bogus branches, and pd$ issues. ++name:decode_restriction type:exclusive default:0x1 ++ 0x1 extra: pdcache_wrong Counts the number of times a decode restriction reduced the decode throughput due to wrong instruction length prediction ++ 0x2 extra: all_3cycle_resteers Counts the number of times a decode restriction reduced the decode throughput because of all 3 cycle resteer conditions. Mainly PDCACHE_WRONG and MS_ENTRY cases. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index badb7ba..4bb34b7 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -127,6 +127,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "AMD64 generic", "x86-64/generic", CPU_AMD64_GENERIC, 4 }, + { "IBM Power Architected Events V1", "ppc64/architected_events_v1", CPU_PPC64_ARCH_V1, 6 }, + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, ++ { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -644,6 +645,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_ATOM: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 934fe9e..4703fa9 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -107,6 +107,7 @@ typedef enum { + CPU_AMD64_GENERIC, /**< AMD64 Generic */ + CPU_PPC64_ARCH_V1, /** < IBM Power architected events version 1 */ + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ ++ CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 9d2aa5e..39c710d 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1201,6 +1201,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 6ae19bc..e86dcae 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -150,6 +150,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x46: + case 0x47: + return CPU_HASWELL; ++ case 0x37: ++ case 0x4d: ++ return CPU_SILVERMONT; + } + } + return cpu_type; 
+diff --git a/utils/ophelp.c b/utils/ophelp.c +index 3b2896a..7543c6f 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -551,19 +551,20 @@ int main(int argc, char const * argv[]) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: + case CPU_ATOM: + event_doc = + "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" +- "Intel Architecture Optimization Reference Manual (730795-001)\n\n"; ++ "Intel Architecture Optimization Reference Manual\n\n"; + break; + + case CPU_ARCH_PERFMON: + event_doc = + "See Intel 64 and IA-32 Architectures Software Developer's Manual\n" +- "Volume 3B (Document 253669) Chapter 18 for architectural perfmon events\n" ++ "Volume 3B Chapter 18 for architectural perfmon events\n" + "This is a limited set of fallback events because oprofile doesn't know your CPU\n"; + break; + +commit 88779857662560604f85db608cf90f8609e1da6f +Author: Andi Kleen +Date: Thu Sep 11 09:00:52 2014 -0500 + + Update the Silvermont event files + + On further review the silvermont event files had a lot of problems. + I regenerated them completely. This fixes the PEBS events, and + fixes a range of others. + + The test suite passes without problems. + + Signed-off-by: Andi Kleen + +diff --git a/events/i386/silvermont/events b/events/i386/silvermont/events +index 077cc0a..434538f 100644 +--- a/events/i386/silvermont/events ++++ b/events/i386/silvermont/events +@@ -7,20 +7,18 @@ + # lowered in many cases without ill effect. 
+ # + include:i386/arch_perfmon +-event:0x32 counters:0,1 um:l2_prefetcher_throttle minimum:200003 name:l2_prefetcher_throttle : +-event:0x3e counters:0,1 um:one minimum:200003 name:l2_prefetcher_pref_stream_alloc : +-event:0x50 counters:0,1 um:zero minimum:200003 name:l2_prefetch_pend_streams_pref_stream_pend_set : +-event:0x86 counters:0,1 um:nip_stall minimum:200003 name:nip_stall : +-event:0x87 counters:0,1 um:decode_stall minimum:200003 name:decode_stall : +-event:0x96 counters:0,1 um:uip_match minimum:200003 name:uip_match : ++event:0x03 counters:0,1 um:rehabq minimum:200003 name:rehabq : ++event:0x04 counters:0,1 um:mem_uops_retired minimum:200003 name:mem_uops_retired : ++event:0x05 counters:0,1 um:page_walks minimum:200003 name:page_walks : ++event:0x30 counters:0,1 um:zero minimum:200003 name:l2_reject_xq_all : ++event:0x31 counters:0,1 um:zero minimum:200003 name:core_reject_l2q_all : ++event:0x80 counters:0,1 um:icache minimum:200003 name:icache : + event:0xc2 counters:0,1 um:uops_retired minimum:2000003 name:uops_retired : +-event:0xc3 counters:0,1 um:x10 minimum:200003 name:machine_clears_live_lock_breaker : +-event:0xc4 counters:0,1 um:br_inst_retired minimum:2000003 name:br_inst_retired : ++event:0xc3 counters:0,1 um:machine_clears minimum:200003 name:machine_clears : ++event:0xc4 counters:0,1 um:br_inst_retired minimum:200003 name:br_inst_retired : + event:0xc5 counters:0,1 um:br_misp_retired minimum:200003 name:br_misp_retired : + event:0xca counters:0,1 um:no_alloc_cycles minimum:200003 name:no_alloc_cycles : + event:0xcb counters:0,1 um:rs_full_stall minimum:200003 name:rs_full_stall : +-event:0xcc counters:0,1 um:rs_dispatch_stall minimum:200003 name:rs_dispatch_stall : +-event:0xe6 counters:0,1 um:baclears minimum:2000003 name:baclears : +-event:0xe7 counters:0,1 um:x02 minimum:200003 name:ms_decoded_early_exit : +-event:0xe8 counters:0,1 um:one minimum:200003 name:btclears_all : +-event:0xe9 counters:0,1 um:decode_restriction minimum:200003 
name:decode_restriction : ++event:0xcd counters:0,1 um:one minimum:2000003 name:cycles_div_busy_all : ++event:0xe6 counters:0,1 um:baclears minimum:200003 name:baclears : ++event:0xe7 counters:0,1 um:one minimum:200003 name:ms_decoded_ms_entry : +diff --git a/events/i386/silvermont/unit_masks b/events/i386/silvermont/unit_masks +index 6309282..c0dac26 100644 +--- a/events/i386/silvermont/unit_masks ++++ b/events/i386/silvermont/unit_masks +@@ -4,68 +4,86 @@ + # See http://ark.intel.com/ for help in identifying Silvermont based CPUs + # + include:i386/arch_perfmon +-name:x02 type:mandatory default:0x2 +- 0x2 No unit mask +-name:x10 type:mandatory default:0x10 +- 0x10 No unit mask +-name:l2_prefetcher_throttle type:exclusive default:0x2 +- 0x2 extra:edge conservative Counts the number of cycles the L2 prefetcher spends in throttling mode +- 0x1 extra:edge aggressive Counts the number of cycles the L2 prefetcher spends in throttling mode +-name:nip_stall type:exclusive default:0x3f +- 0x3f extra: all Counts the number of cycles the NIP stalls. +- 0x1 extra: pfb_full Counts the number of cycles the NIP stalls and the PFBs are full. This DOES NOT inlude PFB throttler cases. +- 0x2 extra: itlb_miss Counts the number of cycles the NIP stalls and there is an outstanding ITLB miss. This is a cummulative count of cycles the NIP stalled for all ITLB misses. +- 0x8 extra: pfb_throttler Counts the number of cycles the NIP stalls, the throttler is engaged, and the PFBs appear full. +- 0x10 extra: do_snoop Counts the number of cycles the NIP stalls because of a SMC compliance snoop to the MEC is required. +- 0x20 extra: misc_other Counts the number of cycles the NIP stalls due to NUKE, Stop Front End, Inserted flows. +- 0x1e extra: pfb_ready Counts the number of cycles the NIP stalls when the PFBs are not full and the decoders are able to process bytes. Does not count PFB_FULL nor MISC_OTHER stall cycles. 
+-name:decode_stall type:exclusive default:0x1 +- 0x1 extra: pfb_empty Counts the number of cycles decoder is stalled because the PFB is empty, this count is useful to see if the decoder is receiving the bytes from the front end. This event together with the DECODE_STALL.IQ_FULL may be used to narrow down on the bottleneck. +- 0x2 extra: iq_full Counts the number of cycles decoder is stalled because the IQ is full, this count is useful to see if the decoder is delivering the decoded uops. This event together with the DECODE_STALL.PFB_EMPTY may be used to narrow down on the bottleneck. +-name:uip_match type:exclusive default:0x1 +- 0x1 extra: first_uip This event is used for counting the number of times a specific micro IP address was decoded +- 0x2 extra: second_uip This event is used for counting the number of times a specific micro IP address was decoded +-name:uops_retired type:exclusive default:0x2 +- 0x2 extra: x87 This event counts the number of micro-ops retired that used X87 hardware. +- 0x4 extra: mul This event counts the number of micro-ops retired that used MUL hardware. +- 0x8 extra: div This event counts the number of micro-ops retired that used DIV hardware. +- 0x1 extra: ms_cyles Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to faults, assists, and inserted flows. +-name:br_inst_retired type:exclusive default:0x1 +- 0x1 extra: remove_jcc REMOVE_JCC counts the number of branch instructions retired but removes taken and not taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x2 extra: remove_rel_call REMOVE_REL_CALL counts the number of branch instructions retired but removes near relative CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x4 extra: remove_ind_call REMOVE_IND_CALL counts the number of branch instructions retired but removes near indirect CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x8 extra: remove_ret REMOVE_RET counts the number of branch instructions retired but removes near RET. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of branch instructions retired but removes near indirect JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x20 extra: remove_rel_jmp REMOVE_REL_JMP counts the number of branch instructions retired but removes near relative JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x40 extra: remove_far REMOVE_FAR counts the number of branch instructions retired but removes all far branches. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of branch instructions retired but removes taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +-name:br_misp_retired type:exclusive default:0x1 +- 0x1 extra: remove_jcc REMOVE_JCC counts the number of mispredicted branch instructions retired but removes taken and not taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x4 extra: remove_ind_call REMOVE_IND_CALL Counts the number of mispredicted branch instructions retired but removes near indirect CALL. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x8 extra: remove_ret REMOVE_RET Counts the number of mispredicted branch instructions retired but removes near RET. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of mispredicted branch instructions retired but removes near indirect JMP. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of mispredicted branch instructions retired but removes taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
++name:rehabq type:exclusive default:0x1 ++ 0x1 extra: ld_block_st_forward This event counts the number of retired loads that were prohibited from receiving forwarded data from the store because of address mismatch. ++ 0x1 extra:pebs ld_block_st_forward_pebs This event counts the number of retired loads that were prohibited from receiving forwarded data from the store because of address mismatch. ++ 0x2 extra: ld_block_std_notready This event counts the cases where a forward was technically possible, but did not occur because the store data was not available at the right time ++ 0x4 extra: st_splits This event counts the number of retire stores that experienced cache line boundary splits ++ 0x8 extra: ld_splits This event counts the number of retire loads that experienced cache line boundary splits ++ 0x8 extra:pebs ld_splits_pebs This event counts the number of retire loads that experienced cache line boundary splits ++ 0x10 extra: lock This event counts the number of retired memory operations with lock semantics. These are either implicit locked instructions such as the XCHG instruction or instructions with an explicit LOCK prefix (0xF0). ++ 0x20 extra: sta_full This event counts the number of retired stores that are delayed because there is not a store address buffer available. ++ 0x40 extra: any_ld This event counts the number of load uops reissued from Rehabq ++ 0x80 extra: any_st This event counts the number of store uops reissued from Rehabq ++name:mem_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_miss_loads This event counts the number of load ops retired that miss in L1 Data cache. Note that prefetch misses will not be counted. 
++ 0x2 extra: l2_hit_loads This event counts the number of load ops retired that hit in the L2 ++ 0x2 extra:pebs l2_hit_loads_pebs This event counts the number of load ops retired that hit in the L2 ++ 0x4 extra: l2_miss_loads This event counts the number of load ops retired that miss in the L2 ++ 0x4 extra:pebs l2_miss_loads_pebs This event counts the number of load ops retired that miss in the L2 ++ 0x8 extra: dtlb_miss_loads This event counts the number of load ops retired that had DTLB miss. ++ 0x8 extra:pebs dtlb_miss_loads_pebs This event counts the number of load ops retired that had DTLB miss. ++ 0x10 extra: utlb_miss This event counts the number of load ops retired that had UTLB miss. ++ 0x20 extra: hitm This event counts the number of load ops retired that got data from the other core or from the other module. ++ 0x20 extra:pebs hitm_pebs This event counts the number of load ops retired that got data from the other core or from the other module. ++ 0x40 extra: all_loads This event counts the number of load ops retired ++ 0x80 extra: all_stores This event counts the number of store ops retired ++name:page_walks type:exclusive default:0x1 ++ 0x1 extra:edge d_side_walks This event counts when a data (D) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x1 extra: d_side_cycles This event counts every cycle when a D-side (walks due to a load) page walk is in progress. Page walk duration divided by number of page walks is the average duration of page-walks. ++ 0x2 extra:edge i_side_walks This event counts when an instruction (I) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x2 extra: i_side_cycles This event counts every cycle when a I-side (walks due to an instruction fetch) page walk is in progress. 
Page walk duration divided by number of page walks is the average duration of page-walks. ++ 0x3 extra:edge walks This event counts when a data (D) page walk or an instruction (I) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x3 extra: cycles This event counts every cycle when a data (D) page walk or instruction (I) page walk is in progress. Since a pagewalk implies a TLB miss, the approximate cost of a TLB miss can be determined from this event. ++name:icache type:exclusive default:0x3 ++ 0x3 extra: accesses This event counts all instruction fetches, including uncacheable fetches. ++ 0x1 extra: hit This event counts all instruction fetches from the instruction cache. ++ 0x2 extra: misses This event counts all instruction fetches that miss the Instruction cache or produce memory requests. This includes uncacheable fetches. An instruction fetch miss is counted only once and not once for every cycle it is outstanding. ++name:uops_retired type:exclusive default:0x10 ++ 0x10 extra: all This event counts the number of micro-ops retired. The processor decodes complex macro instructions into a sequence of simpler micro-ops. Most instructions are composed of one or two micro-ops. Some instructions are decoded into longer sequences such as repeat instructions, floating point transcendental instructions, and assists. In some cases micro-op sequences are fused or whole instructions are fused into one micro-op. See other UOPS_RETIRED events for differentiating retired fused and non-fused micro-ops. ++ 0x1 extra: ms This event counts the number of micro-ops retired that were supplied from MSROM. ++name:machine_clears type:exclusive default:0x8 ++ 0x8 extra: all Machine clears happen when something happens in the machine that causes the hardware to need to take special care to get the right answer. 
When such a condition is signaled on an instruction, the front end of the machine is notified that it must restart, so no more instructions will be decoded from the current path. All instructions "older" than this one will be allowed to finish. This instruction and all "younger" instructions must be cleared, since they must not be allowed to complete. Essentially, the hardware waits until the problematic instruction is the oldest instruction in the machine. This means all older instructions are retired, and all pending stores (from older instructions) are completed. Then the new path of instructions from the front end are allowed to start into the machine. There are many conditions that might cause a machine clear (including the receipt of an interrupt, or a trap or a fault). All those conditions (including but not limited to MACHINE_CLEARS.MEMORY_ORDERING, MACHINE_CLEARS.SMC, and MACHINE_CLEARS.FP_ASSIST) are captured in the ANY event. In addition, some conditions can be specifically counted (i.e. SMC, MEMORY_ORDERING, FP_ASSIST). However, the sum of SMC, MEMORY_ORDERING, and FP_ASSIST machine clears will not necessarily equal the number of ANY. ++ 0x1 extra: smc This event counts the number of times that a program writes to a code section. Self-modifying code causes a severe penalty in all Intel(R) architecture processors. ++ 0x2 extra: memory_ordering This event counts the number of times that pipeline was cleared due to memory ordering issues. ++ 0x4 extra: fp_assist This event counts the number of times that pipeline stalled due to FP operations needing assists. ++name:br_inst_retired type:exclusive default:0x7e ++ 0x7e extra: jcc JCC counts the number of conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x7e extra:pebs jcc_pebs JCC counts the number of conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfe extra: taken_jcc TAKEN_JCC counts the number of taken conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfe extra:pebs taken_jcc_pebs TAKEN_JCC counts the number of taken conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf9 extra: call CALL counts the number of near CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf9 extra:pebs call_pebs CALL counts the number of near CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfd extra: rel_call REL_CALL counts the number of near relative CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfd extra:pebs rel_call_pebs REL_CALL counts the number of near relative CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfb extra: ind_call IND_CALL counts the number of near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfb extra:pebs ind_call_pebs IND_CALL counts the number of near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf7 extra: return RETURN counts the number of near RET branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf7 extra:pebs return_pebs RETURN counts the number of near RET branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xeb extra: non_return_ind NON_RETURN_IND counts the number of near indirect JMP and near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xeb extra:pebs non_return_ind_pebs NON_RETURN_IND counts the number of near indirect JMP and near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xbf extra: far_branch FAR counts the number of far branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xbf extra:pebs far_branch_pebs FAR counts the number of far branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++name:br_misp_retired type:exclusive default:0x7e ++ 0x7e extra: jcc JCC counts the number of mispredicted conditional branches (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x7e extra:pebs jcc_pebs JCC counts the number of mispredicted conditional branches (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfe extra: taken_jcc TAKEN_JCC counts the number of mispredicted taken conditional branch (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
++ 0xfe extra:pebs taken_jcc_pebs TAKEN_JCC counts the number of mispredicted taken conditional branch (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfb extra: ind_call IND_CALL counts the number of mispredicted near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfb extra:pebs ind_call_pebs IND_CALL counts the number of mispredicted near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xf7 extra: return RETURN counts the number of mispredicted near RET branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xf7 extra:pebs return_pebs RETURN counts the number of mispredicted near RET branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xeb extra: non_return_ind NON_RETURN_IND counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xeb extra:pebs non_return_ind_pebs NON_RETURN_IND counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
+ name:no_alloc_cycles type:exclusive default:0x3f +- 0x3f extra:inv all Counts the number of cycles that uops are allocated (inverse of NO_ALLOC_CYCLES.ALL) +- 0x2 extra: sd_buffer_full Counts the number of cycles when no uops are allocated and the store data buffer is full. +- 0x4 extra: mispredicts Counts the number of cycles when no uops are allocated and the alloc pipe is stalled waiting for a mispredicted jump to retire. After the misprediction is detected, the front end will start immediately but the allocate pipe stalls until the mispredicted +- 0x8 extra: scoreboard Counts the number of cycles when no uops are allocated and a microcode IQ-based scoreboard stall is active. This includes stalls due to both the retirement scoreboard (at-ret) and micro-Jcc execution scoreboard (at-jeu). Does not count cycles when the MS +- 0x10 extra: iq_empty Counts the number of cycles when no uops are allocated and the IQ is empty. Will assert immediately after a mispredict and partially overlap with MISPREDICTS sub event. +-name:rs_full_stall type:exclusive default:0x2 +- 0x2 extra: iec_port0 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 0 is full. +- 0x4 extra: iec_port1 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 1 is full. +- 0x8 extra: fpc_port0 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 0 is full. +- 0x10 extra: fpc_port1 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 1 is full. 
+-name:rs_dispatch_stall type:exclusive default:0x1 +- 0x1 extra: iec0_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 0 of IEC RS while the RS had valid ops left to dispatch +- 0x2 extra: iec1_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 1 of IEC RS while the RS had valid ops left to dispatch +- 0x4 extra: fpc0_rs Counts cycles when no uops were disptached from port 0 of FPC RS while the RS had valid ops left to dispatch +- 0x8 extra: fpc1_rs Counts cycles when no uops were disptached from port 1 of FPC RS while the RS had valid ops left to dispatch +- 0x10 extra: mec_rs Counts cycles when no uops were dispatched from the MEC RS or rehab queue while valid ops were left to dispatch +-name:baclears type:exclusive default:0x2 +- 0x2 extra: indirect Counts the number indirect branch baclears +- 0x4 extra: uncond Counts the number unconditional branch baclears +- 0x1e extra: no_corner_case sum of submasks [4:1]. Does not count special case baclears due to things like parity errors, bogus branches, and pd$ issues. +-name:decode_restriction type:exclusive default:0x1 +- 0x1 extra: pdcache_wrong Counts the number of times a decode restriction reduced the decode throughput due to wrong instruction length prediction +- 0x2 extra: all_3cycle_resteers Counts the number of times a decode restriction reduced the decode throughput because of all 3 cycle resteer conditions. Mainly PDCACHE_WRONG and MS_ENTRY cases. ++ 0x3f extra: all The NO_ALLOC_CYCLES.ALL event counts the number of cycles when the front-end does not provide any instructions to be allocated for any reason. This event indicates the cycles where an allocation stalls occurs, and no UOPS are allocated in that cycle. 
++ 0x1 extra: rob_full Counts the number of cycles when no uops are allocated and the ROB is full (less than 2 entries available) ++ 0x20 extra: rat_stall Counts the number of cycles when no uops are allocated and a RATstall is asserted. ++ 0x50 extra: not_delivered The NO_ALLOC_CYCLES.NOT_DELIVERED event is used to measure front-end inefficiencies, i.e. when front-end of the machine is not delivering micro-ops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into micro-ops (uops) in machine understandable format and putting them into a micro-op queue to be consumed by back end. The back-end then takes these micro-ops, allocates the required resources. When all resources are ready, micro-ops are executed. If the back-end is not ready to accept micro-ops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more UOPS. This event counts the cycles only when back-end is requesting more uops and front-end is not able to provide them. Some examples of conditions that cause front-end inefficiencies are: Icache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. ++name:rs_full_stall type:exclusive default:0x1f ++ 0x1f extra: all Counts the number of cycles the Alloc pipeline is stalled when any one of the RSs (IEC, FPC and MEC) is full. This event is a superset of all the individual RS stall event counts. 
++ 0x1 extra: mec Counts the number of cycles the allocation pipeline is stalled and is waiting for a free MEC reservation station entry. The cycles should be appropriately counted in case of the cracked ops e.g. In case of a cracked load-op, the load portion is sent to M ++name:baclears type:exclusive default:0x1 ++ 0x1 extra: all The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.ANY event counts the number of baclears for any type of branch. ++ 0x8 extra: return The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.RETURN event counts the number of RETURN baclears. ++ 0x10 extra: cond The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.COND event counts the number of JCC (Jump on Conditional Code) baclears. diff --git a/SOURCES/oprofile-skylake.patch b/SOURCES/oprofile-skylake.patch new file mode 100644 index 0000000..201b697 --- /dev/null +++ b/SOURCES/oprofile-skylake.patch @@ -0,0 +1,720 @@ +From 917dfab881becfad104ad02682a88afb54284932 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Wed, 1 Jul 2015 14:36:42 -0700 +Subject: [PATCH 1/3] Add support for Intel Skylake events + +Add support for the Intel Skylake micro architecture to oprofile. 
+ +OFFCORE_* and FRONTEND_* events are not supported for now because +oprofile does not support setting up config1 + +Signed-off-by: Andi Kleen +--- + events/Makefile.am | 1 + + events/i386/skylake/events | 62 ++++++++ + events/i386/skylake/unit_masks | 314 +++++++++++++++++++++++++++++++++++++++++ + libop/op_cpu_type.c | 2 + + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + libop/op_hw_specific.h | 3 + + utils/ophelp.c | 1 + + 8 files changed, 385 insertions(+) + create mode 100644 events/i386/skylake/events + create mode 100644 events/i386/skylake/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index d68f0e8..56f9020 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -18,6 +18,7 @@ event_files = \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ + i386/broadwell/events i386/broadwell/unit_masks \ ++ i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ +diff --git a/events/i386/skylake/events b/events/i386/skylake/events +new file mode 100644 +index 0000000..28d6654 +--- /dev/null ++++ b/events/i386/skylake/events +@@ -0,0 +1,62 @@ ++# ++# Intel "Skylake" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Skylake based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++event:0x00 counters:1 um:inst_retired minimum:2000003 name:inst_retired : ++event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x07 counters:cpuid um:ld_blocks_partial minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:int_misc minimum:2000003 name:int_misc : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x14 counters:cpuid um:arith minimum:2000003 name:arith_divider_active : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x2e counters:cpuid um:longest_lat_cache minimum:100003 name:longest_lat_cache : ++event:0x3c counters:cpuid um:cpu_clk_thread_unhalted minimum:2000003 name:cpu_clk_thread_unhalted : ++event:0x48 counters:cpuid um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre_sw_pf : ++event:0x4f counters:cpuid um:ept minimum:2000003 name:ept_walk_pending : ++event:0x51 counters:cpuid um:l1d minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles_cache_lock_duration : ++event:0x79 counters:cpuid um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:icache_16b minimum:2000003 name:icache_16b_ifdata_stall : ++event:0x83 counters:cpuid um:icache_64b minimum:200003 name:icache_64b : ++event:0x85 
counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall_lcp : ++event:0x9c counters:cpuid um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000003 name:uops_dispatched_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:cpuid um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa6 counters:cpuid um:exe_activity minimum:2000003 name:exe_activity : ++event:0xa8 counters:cpuid um:lsd minimum:2000003 name:lsd : ++event:0xab counters:cpuid um:dsb2mite_switches minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:itlb minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xb2 counters:cpuid um:offcore_requests_buffer minimum:2000003 name:offcore_requests_buffer_sq_full : ++event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists_any : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc7 counters:cpuid um:fp_arith_inst_retired minimum:2000003 name:fp_arith_inst_retired : ++event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:cpuid um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist_any : ++event:0xcb counters:cpuid um:hw_interrupts minimum:100003 
name:hw_interrupts_received : ++event:0xd0 counters:0,1,2,3 um:mem_inst_retired minimum:2000003 name:mem_inst_retired : ++event:0xd1 counters:0,1,2,3 um:mem_load_retired minimum:2000003 name:mem_load_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_l3_hit_retired minimum:100003 name:mem_load_l3_hit_retired : ++event:0xe6 counters:cpuid um:baclears minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans_l2_wb : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in_all : +diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks +new file mode 100644 +index 0000000..98ed65c +--- /dev/null ++++ b/events/i386/skylake/unit_masks +@@ -0,0 +1,314 @@ ++# ++# Unit masks for the Intel "Skylake" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Skylake based CPUs ++# ++name:arith type:mandatory default:0x1 ++ 0x1 extra:cmask=1,edge divider_active Cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. ++name:baclears type:mandatory default:0x1 ++ 0x1 extra: any Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end. ++name:dsb2mite_switches type:mandatory default:0x2 ++ 0x2 extra: penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. ++name:ept type:mandatory default:0x10 ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a EPT (Extended Page Table) walk for any request type. ++name:fp_assist type:mandatory default:0x1e ++ 0x1e extra:cmask=1 any Cycles with any input/output SSE or FP assist ++name:hw_interrupts type:mandatory default:0x1 ++ 0x1 extra: received Number of hardware interrupts received by the processor. 
++name:icache_16b type:mandatory default:0x4 ++ 0x4 extra: ifdata_stall Cycles where a code fetch is stalled due to L1 instruction cache miss. ++name:ild_stall type:mandatory default:0x1 ++ 0x1 extra: lcp Stalls caused by changing prefix length of the instruction. ++name:itlb type:mandatory default:0x1 ++ 0x1 extra: itlb_flush Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages. ++name:l1d type:mandatory default:0x1 ++ 0x1 extra: replacement L1D data line replacements ++name:l2_lines_in type:mandatory default:0x7 ++ 0x7 extra: all L2 cache lines filling L2 ++name:l2_trans type:mandatory default:0x40 ++ 0x40 extra: l2_wb L2 writebacks that access L2 cache ++name:ld_blocks_partial type:mandatory default:0x1 ++ 0x1 extra: address_alias False dependencies in MOB due to partial compare on address. ++name:load_hit_pre type:mandatory default:0x1 ++ 0x1 extra: sw_pf Demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch. ++name:lock_cycles type:mandatory default:0x2 ++ 0x2 extra: cache_lock_duration Cycles when L1D is locked ++name:offcore_requests_buffer type:mandatory default:0x1 ++ 0x1 extra: sq_full Offcore requests buffer cannot take more entries for this thread core. ++name:other_assists type:mandatory default:0x3f ++ 0x3f extra: any Number of times a microcode assist is invoked by HW other than FP-assist. Examples include AD (page Access Dirty) and AVX* related assists. ++name:inst_retired type:exclusive default:any ++ 0x1 extra: any Instructions retired from execution.mem ++ 0x0 extra: any_p Number of instructions retired. General Counter - architectural event ++ 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution ++name:cpu_clk_unhalted type:exclusive default:thread ++ 0x2 extra: thread Core cycles when the thread is not in halt state ++ 0x3 extra: ref_tsc Reference cycles when the core is not in halt state. 
++ 0x0 extra: thread_p Thread cycles when thread is not in halt state ++ 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state ++ 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward loads blocked by overlapping with store buffer that cannot be forwarded . ++ 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use ++name:dtlb_load_misses type:exclusive default:miss_causes_a_walk ++ 0x1 extra: miss_causes_a_walk Load misses in all DTLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for a load. ++ 0x20 extra: stlb_hit Loads that miss the DTLB and hit the STLB. ++ 0xe extra: walk_completed Load miss in all TLB levels causes a page walk that completes. (All page sizes) ++ 0x10 extra:cmask=1 walk_active Cycles when at least one PMH is busy with a page walk for a load. ++name:int_misc type:exclusive default:recovery_cycles ++ 0x1 extra: recovery_cycles Core cycles the allocator was stalled due to recovery from earlier clear event for this thread (e.g. misprediction or memory nuke) ++ 0x80 extra: clear_resteer_cycles Cycles the issue-stage is waiting for front-end to fetch from resteered path following branch misprediction or machine clear events. ++ 0x1 extra:any recovery_cycles_any Core cycles the allocator was stalled due to recovery from earlier clear event for any thread running on the physical core (e.g. misprediction or memory nuke) ++name:uops_issued type:exclusive default:any ++ 0x1 extra: any Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS) ++ 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 
2 sources + immediate) regardless if as a result of LEA instruction or not. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles when Resource Allocation Table (RAT) does not issue Uops to Reservation Station (RS) for the thread ++ 0x2 extra: vector_width_mismatch This event counts the number of Blend Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS) in order to preserve upper bits of vector registers. Starting the Skylake microarchitecture, these Blend uops are needed since every Intel SSE instruction executed in Dirty Upper State needs to preserve bits 128-255 of the destination register. For more information, refer to ?Mixing Intel AVX and Intel SSE Code? section of the Optimization Guide. ++name:l2_rqsts type:exclusive default:0x21 ++ 0x21 extra: demand_data_rd_miss Demand Data Read miss L2, no rejects ++ 0x41 extra: demand_data_rd_hit Demand Data Read requests that hit L2 cache ++ 0xe1 extra: all_demand_data_rd Demand Data Read requests ++ 0xe2 extra: all_rfo RFO requests to L2 cache ++ 0xe4 extra: all_code_rd L2 code requests ++ 0xf8 extra: all_pf Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches ++ 0x38 extra: pf_miss Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that miss L2 cache ++ 0xd8 extra: pf_hit Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that hit L2 cache ++ 0x42 extra: rfo_hit RFO requests that hit L2 cache ++ 0x22 extra: rfo_miss RFO requests that miss L2 cache ++ 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. 
++ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:longest_lat_cache type:exclusive default:0x41 ++ 0x41 extra: miss Core-originated cacheable demand requests missed L3 ++ 0x4f extra: reference Core-originated cacheable demand requests that refer to L3 ++name:cpu_clk_thread_unhalted type:exclusive default:ref_xclk ++ 0x1 extra: ref_xclk Reference cycles when the thread is unhalted (counts at 100 MHz rate) ++ 0x2 extra: one_thread_active Count XClk pulses when this thread is unhalted and the other thread is halted. ++ 0x1 extra:any ref_xclk_any Reference cycles when at least one thread on the physical core is unhalted (counts at 100 MHz rate) ++name:l1d_pend_miss type:exclusive default:pending ++ 0x1 extra: pending L1D miss outstanding duration in cycles ++ 0x2 extra: fb_full Number of times a request needed a FB entry but there was no entry available for it. That is the FB unavailability was dominant reason for blocking the request. A request includes cacheable/uncacheable demands that is load, store or SW prefetch. HWP are e ++ 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. ++ 0x1 extra:cmask=1,any pending_cycles_any Cycles with L1D load Misses outstanding from any thread on physical core ++name:dtlb_store_misses type:exclusive default:miss_causes_a_walk ++ 0x1 extra: miss_causes_a_walk Store misses in all DTLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for a store. ++ 0x20 extra: stlb_hit Stores that miss the DTLB and hit the STLB. ++ 0xe extra: walk_completed Store misses in all TLB levels causes a page walk that completes.
(All page sizes) ++ 0x10 extra:cmask=1 walk_active Cycles when at least one PMH is busy with a page walk for a store. ++name:tx_mem type:exclusive default:0x1 ++ 0x1 extra: abort_conflict Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address ++ 0x2 extra: abort_capacity Number of times a transactional abort was signaled due to a data capacity limitation for transactional reads or writes. ++ 0x4 extra: abort_hle_store_to_elided_lock Number of times a HLE transactional region aborted due to a non XRELEASE prefixed instruction writing to an elided lock in the elision buffer ++ 0x8 extra: abort_hle_elision_buffer_not_empty Number of times an HLE transactional execution aborted due to NoAllocatedElisionBuffer being non-zero. ++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer ++ 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times an HLE transactional execution aborted due to an unsupported read alignment from the elision buffer. ++ 0x40 extra: hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. ++name:tx_exec type:exclusive default:0x1 ++ 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort. ++ 0x2 extra: misc2 Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region ++ 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded ++ 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. 
++ 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++name:rs_events type:exclusive default:empty_cycles ++ 0x1 extra: empty_cycles Cycles when Reservation Station (RS) is empty for the thread ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd ++ 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. ++ 0x2 extra:cmask=1 demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. ++ 0x4 extra:cmask=1 demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore ++ 0x10 extra: l3_miss_demand_data_rd Counts number of Offcore outstanding Demand Data Read requests who miss L3 cache in the superQ every cycle. ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x8 extra:cmask=1 cycles_with_data_rd Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. 
++ 0x4 extra:cmask=1 cycles_with_demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x10 extra:cmask=1 cycles_with_l3_miss_demand_data_rd Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ ++ 0x10 extra:cmask=6 l3_miss_demand_data_rd_ge_6 Cycles with at least 6 Demand Data Read requests who miss L3 cache in the superQ ++ 0x1 extra:cmask=6 demand_data_rd_ge_6 Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue ++name:idq type:exclusive default:mite_uops ++ 0x4 extra: mite_uops Uops delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x8 extra: dsb_uops Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path ++ 0x20 extra: ms_mite_uops Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy ++ 0x30 extra:cmask=1 ms_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy ++ 0x4 extra:cmask=1 mite_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x8 extra:cmask=1 dsb_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path ++ 0x10 extra:cmask=1 ms_dsb_cycles Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy ++ 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops ++ 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering any Uop ++ 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops ++ 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering any Uop ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to
the Microcode Sequencer ++ 0x30 extra: ms_uops Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy ++name:icache_64b type:exclusive default:0x1 ++ 0x1 extra: iftag_hit Instruction fetch tag lookups that hit in the instruction cache (L1I). Counts at 64-byte cache-line granularity. ++ 0x2 extra: iftag_miss Instruction fetch tag lookups that miss in the instruction cache (L1I). Counts at 64-byte cache-line granularity. ++ 0x4 extra: iftag_stall Cycles where a code fetch is stalled due to L1 instruction cache tag miss. ++name:itlb_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk Misses at all ITLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. ++ 0x20 extra: stlb_hit Instruction fetch requests that miss the ITLB and hit the STLB. ++ 0xe extra: walk_completed Code miss in all TLB levels causes a page walk that completes. (All page sizes) ++name:idq_uops_not_delivered type:exclusive default:core ++ 0x1 extra: core Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled ++ 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. ++ 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. ++ 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
++name:uops_dispatched_port type:exclusive default:0x1 ++ 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 ++ 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 ++ 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 ++ 0x8 extra: port_3 Cycles per thread when uops are executed in port 3 ++ 0x10 extra: port_4 Cycles per thread when uops are executed in port 4 ++ 0x20 extra: port_5 Cycles per thread when uops are executed in port 5 ++ 0x40 extra: port_6 Cycles per thread when uops are executed in port 6 ++ 0x80 extra: port_7 Cycles per thread when uops are executed in port 7 ++name:resource_stalls type:exclusive default:0x1 ++ 0x1 extra: any Resource-related stall cycles ++ 0x8 extra: sb Cycles stalled due to no store buffers available. (not including draining from sync). ++name:cycle_activity type:exclusive default:0x4 ++ 0x4 extra:cmask=4 stalls_total Total execution stalls. ++ 0x8 extra:cmask=8 cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. ++ 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x1 extra:cmask=1 cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x10 extra:cmask=10 cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x14 extra:cmask=14 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++ 0x2 extra:cmask=2 cycles_l3_miss Cycles while L3 cache miss demand load is outstanding. ++ 0x6 extra:cmask=6 stalls_l3_miss Execution stalls while L3 cache miss demand load is outstanding. ++name:exe_activity type:exclusive default:0x1 ++ 0x1 extra: exe_bound_0_ports Cycles where no uops were executed, the Reservation Station was not empty, the Store Buffer was full and there was no outstanding load.
++ 0x2 extra: u1_ports_util Cycles total of 1 uop is executed on all ports and Reservation Station was not empty. ++ 0x4 extra: u2_ports_util Cycles total of 2 uops are executed on all ports and Reservation Station was not empty. ++ 0x8 extra: u3_ports_util Cycles total of 3 uops are executed on all ports and Reservation Station was not empty. ++ 0x10 extra: u4_ports_util Cycles total of 4 uops are executed on all ports and Reservation Station was not empty. ++ 0x40 extra: bound_on_stores Cycles where the Store Buffer was full and no outstanding load. ++name:lsd type:exclusive default:uops ++ 0x1 extra: uops Number of Uops delivered by the LSD. ++ 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++name:offcore_requests type:exclusive default:0x80 ++ 0x80 extra: all_requests Any memory transaction that reached the SQ. ++ 0x1 extra: demand_data_rd Demand Data Read requests sent to uncore ++ 0x2 extra: demand_code_rd Cacheable and noncacheable code read requests ++ 0x4 extra: demand_rfo Demand RFO requests including regular RFOs, locks, ItoM ++ 0x8 extra: all_data_rd Demand and prefetch data reads ++ 0x10 extra: l3_miss_demand_data_rd Demand Data Read requests who miss L3 cache ++name:uops_executed type:exclusive default:thread ++ 0x1 extra: thread Counts the number of uops to be executed per-thread each cycle. ++ 0x2 extra: core Number of uops executed on the core. ++ 0x10 extra: x87 Counts the number of x87 uops dispatched. ++ 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread.
++ 0x1 extra:cmask=1 cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x2 extra:cmask=1 core_cycles_ge_1 Cycles at least 1 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=2 core_cycles_ge_2 Cycles at least 2 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=3 core_cycles_ge_3 Cycles at least 3 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=4 core_cycles_ge_4 Cycles at least 4 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=1,inv core_cycles_none Cycles with no micro-ops executed from any thread on physical core ++name:tlb_flush type:exclusive default:0x1 ++ 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries ++ 0x20 extra: stlb_any STLB flush attempts ++name:uops_retired type:exclusive default:retire_slots ++ 0x2 extra: retire_slots Retirement slots used. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. ++ 0x2 extra: memory_ordering Counts the number of machine clears due to memory order conflicts. ++ 0x4 extra: smc Self-modifying code (SMC) detected. ++name:br_inst_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches All (macro) branch instructions retired. ++ 0x1 extra: conditional Conditional branch instructions retired. ++ 0x1 extra:pebs conditional_pebs Conditional branch instructions retired. ++ 0x2 extra: near_call Direct and indirect near call instructions retired. 
++ 0x2 extra:pebs near_call_pebs Direct and indirect near call instructions retired. ++ 0x8 extra: near_return Return instructions retired. ++ 0x8 extra:pebs near_return_pebs Return instructions retired. ++ 0x10 extra: not_taken Not taken branch instructions retired. ++ 0x20 extra: near_taken Taken branch instructions retired. ++ 0x20 extra:pebs near_taken_pebs Taken branch instructions retired. ++ 0x40 extra: far_branch Far branch instructions retired. ++ 0x40 extra:pebs far_branch_pebs Far branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches All mispredicted macro branch instructions retired. ++ 0x1 extra: conditional Mispredicted conditional branch instructions retired. ++ 0x1 extra:pebs conditional_pebs Mispredicted conditional branch instructions retired. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra:pebs near_taken_pebs number of near branch instructions retired that were mispredicted and taken. ++ 0x4 extra:pebs all_branches_pebs Mispredicted macro branch instructions retired. ++name:fp_arith_inst_retired type:exclusive default:0x1 ++ 0x1 extra: scalar_double Number of SSE/AVX computational scalar double precision floating-point instructions retired. Each count represents 1 computation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x2 extra: scalar_single Number of SSE/AVX computational scalar single precision floating-point instructions retired. Each count represents 1 computation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT FM(N)ADD/SUB. 
FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x4 extra: u128b_packed_double Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired. Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x8 extra: u128b_packed_single Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired. Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x10 extra: u256b_packed_double Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired. Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x20 extra: u256b_packed_single Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired. Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++name:hle_retired type:exclusive default:start ++ 0x1 extra: start Number of times an HLE execution started. ++ 0x2 extra: commit Number of times an HLE execution successfully committed ++ 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). 
++ 0x4 extra:pebs aborted_pebs Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ++ 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts). ++ 0x10 extra: aborted_misc2 Number of times an HLE execution aborted due to hardware timer expiration. ++ 0x20 extra: aborted_misc3 Number of times an HLE execution aborted due to HLE-unfriendly instructions and certain unfriendly events (such as AD assists etc.). ++ 0x40 extra: aborted_misc4 Number of times an HLE execution aborted due to incompatible memory type ++ 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to unfriendly events (such as interrupts). ++name:rtm_retired type:exclusive default:start ++ 0x1 extra: start Number of times an RTM execution started. ++ 0x2 extra: commit Number of times an RTM execution successfully committed ++ 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). ++ 0x4 extra:pebs aborted_pebs Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). ++ 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts) ++ 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to uncommon conditions. ++ 0x20 extra: aborted_misc3 Number of times an RTM execution aborted due to HLE-unfriendly instructions ++ 0x40 extra: aborted_misc4 Number of times an RTM execution aborted due to incompatible memory type ++ 0x80 extra: aborted_misc5 Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. 
interrupt) ++name:mem_inst_retired type:exclusive default:stlb_miss_loads ++ 0x11 extra: stlb_miss_loads Number of load instructions retired with STLB miss ++ 0x11 extra:pebs stlb_miss_loads_pebs Number of load instructions retired with STLB miss ++ 0x12 extra: stlb_miss_stores Number of store instructions retired with STLB miss ++ 0x12 extra:pebs stlb_miss_stores_pebs Number of store instructions retired with STLB miss ++ 0x21 extra: lock_loads Number of lock load instructions retired ++ 0x21 extra:pebs lock_loads_pebs Number of lock load instructions retired ++ 0x41 extra: split_loads Number of load instructions retired with cache-line splits that may impact performance. ++ 0x41 extra:pebs split_loads_pebs Number of load instructions retired with cache-line splits that may impact performance. ++ 0x42 extra: split_stores Number of store instructions retired with line-split ++ 0x42 extra:pebs split_stores_pebs Number of store instructions retired with line-split ++ 0x81 extra: all_loads Number of load instructions retired ++ 0x81 extra:pebs all_loads_pebs Number of load instructions retired ++ 0x82 extra: all_stores Number of store instructions retired ++ 0x82 extra:pebs all_stores_pebs Number of store instructions retired ++name:mem_load_retired type:exclusive default:l1_hit ++ 0x1 extra: l1_hit Retired load instructions with L1 cache hits as data sources ++ 0x1 extra:pebs l1_hit_pebs Retired load instructions with L1 cache hits as data sources ++ 0x2 extra: l2_hit Retired load instructions with L2 cache hits as data sources ++ 0x2 extra:pebs l2_hit_pebs Retired load instructions with L2 cache hits as data sources ++ 0x4 extra: l3_hit Retired load instructions with L3 cache hits as data sources ++ 0x4 extra:pebs l3_hit_pebs Retired load instructions with L3 cache hits as data sources ++ 0x8 extra: l1_miss Retired load instructions missed L1 cache as data sources ++ 0x8 extra:pebs l1_miss_pebs Retired load instructions missed L1 cache as data sources ++ 0x10 extra: 
l2_miss Retired load instructions missed L2 cache as data sources ++ 0x10 extra:pebs l2_miss_pebs Retired load instructions missed L2 cache as data sources ++ 0x20 extra: l3_miss Retired load instructions missed L3 cache as data sources ++ 0x20 extra:pebs l3_miss_pebs Retired load instructions missed L3 cache as data sources ++ 0x40 extra: fb_hit Retired load instructions which data sources were load missed L1 but hit FB due to preceding miss to the same cache line with data not ready ++ 0x40 extra:pebs fb_hit_pebs Retired load instructions which data sources were load missed L1 but hit FB due to preceding miss to the same cache line with data not ready ++name:mem_load_l3_hit_retired type:exclusive default:xsnp_miss ++ 0x1 extra: xsnp_miss Retired load instructions which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. ++ 0x1 extra:pebs xsnp_miss_pebs Retired load instructions which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. ++ 0x2 extra: xsnp_hit Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache ++ 0x2 extra:pebs xsnp_hit_pebs Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache ++ 0x4 extra: xsnp_hitm Retired load instructions which data sources were HitM responses from shared L3 ++ 0x4 extra:pebs xsnp_hitm_pebs Retired load instructions which data sources were HitM responses from shared L3 ++ 0x8 extra: xsnp_none Retired load instructions which data sources were hits in L3 without snoops required ++ 0x8 extra:pebs xsnp_none_pebs Retired load instructions which data sources were hits in L3 without snoops required +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 24ed697..b1d5ecf 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -121,6 +121,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, + { "ARM 
Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, ++ { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -737,6 +738,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_NEHALEM: + case CPU_HASWELL: + case CPU_BROADWELL: ++ case CPU_SKYLAKE: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 2bd00ce..9983f87 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -101,6 +101,7 @@ typedef enum { + CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ ++ CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 2badc8e..f58d243 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1200,6 +1200,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_NEHALEM: + case CPU_HASWELL: + case CPU_BROADWELL: ++ case CPU_SKYLAKE: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 8a7ed1c..994fec4 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -154,6 +154,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4f: + case 0x56: + return CPU_BROADWELL; ++ case 0x4e: ++ case 0x5e: ++ return CPU_SKYLAKE; + case 0x37: + case 0x4d: + case 0x4c: +diff --git a/utils/ophelp.c b/utils/ophelp.c +index a80fec8..fdddddc 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -542,6 +542,7 @@ int main(int argc, char const * argv[]) + case CPU_NEHALEM: + case CPU_HASWELL: + case CPU_BROADWELL: ++ case CPU_SKYLAKE: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +-- +2.4.3 + +From 
ccc38adf33e3ae845e0b7c4f8fe77beceaa7b930 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Mon, 6 Jul 2015 16:48:25 -0700 +Subject: [PATCH 2/3] oprofile: Fixes for Skylake event lists + +This fixes the review feedback for the Skylake event list. + +- Fix event codes for INST_RETIRED, CPU_CLK_UNHALTED. +- Fix OFFCORE_REQUESTS_OUTSTANDING events +- Add br_inst_retired.all_branches_pebs +- Fill in correct default event +--- + events/i386/skylake/events | 4 ++-- + events/i386/skylake/unit_masks | 25 +++++++++++++------------ + libop/op_events.c | 5 ++++- + 3 files changed, 19 insertions(+), 15 deletions(-) + +diff --git a/events/i386/skylake/events b/events/i386/skylake/events +index 28d6654..9a04a86 100644 +--- a/events/i386/skylake/events ++++ b/events/i386/skylake/events +@@ -6,8 +6,6 @@ + # Note the minimum counts are not discovered experimentally and could be likely + # lowered in many cases without ill effect. + # +-event:0x00 counters:1 um:inst_retired minimum:2000003 name:inst_retired : +-event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : + event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : + event:0x07 counters:cpuid um:ld_blocks_partial minimum:100003 name:ld_blocks_partial_address_alias : + event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : +@@ -16,6 +14,7 @@ event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : + event:0x14 counters:cpuid um:arith minimum:2000003 name:arith_divider_active : + event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : + event:0x2e counters:cpuid um:longest_lat_cache minimum:100003 name:longest_lat_cache : ++event:0x3c counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : + event:0x3c counters:cpuid um:cpu_clk_thread_unhalted minimum:2000003 name:cpu_clk_thread_unhalted : + event:0x48 counters:cpuid um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : + event:0x49 counters:cpuid 
um:dtlb_store_misses minimum:2000003 name:dtlb_store_misses : +@@ -44,6 +43,7 @@ event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_reques + event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : + event:0xb2 counters:cpuid um:offcore_requests_buffer minimum:2000003 name:offcore_requests_buffer_sq_full : + event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : ++event:0xc0 counters:1 um:inst_retired minimum:2000003 name:inst_retired : + event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists_any : + event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : + event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears : +diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks +index 98ed65c..b505769 100644 +--- a/events/i386/skylake/unit_masks ++++ b/events/i386/skylake/unit_masks +@@ -37,16 +37,6 @@ name:offcore_requests_buffer type:mandatory default:0x1 + 0x1 extra: sq_full Offcore requests buffer cannot take more entries for this thread core. + name:other_assists type:mandatory default:0x3f + 0x3f extra: any Number of times a microcode assist is invoked by HW other than FP-assist. Examples include AD (page Access Dirty) and AVX* related assists. +-name:inst_retired type:exclusive default:any +- 0x1 extra: any Instructions retired from execution.mem +- 0x0 extra: any_p Number of instructions retired. General Counter - architectural event +- 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution +-name:cpu_clk_unhalted type:exclusive default:thread +- 0x2 extra: thread Core cycles when the thread is not in halt state +- 0x3 extra: ref_tsc Reference cycles when the core is not in halt state. 
+- 0x0 extra: thread_p Thread cycles when thread is not in halt state +- 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state +- 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state + name:ld_blocks type:exclusive default:0x2 + 0x2 extra: store_forward loads blocked by overlapping with store buffer that cannot be forwarded . + 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use +@@ -85,6 +75,12 @@ name:l2_rqsts type:exclusive default:0x21 + name:longest_lat_cache type:exclusive default:0x41 + 0x41 extra: miss Core-originated cacheable demand requests missed L3 + 0x4f extra: reference Core-originated cacheable demand requests that refer to L3 ++name:cpu_clk_unhalted type:exclusive default:thread ++ 0x2 extra: thread Core cycles when the thread is not in halt state ++ 0x3 extra: ref_tsc Reference cycles when the core is not in halt state. ++ 0x0 extra: thread_p Thread cycles when thread is not in halt state ++ 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state ++ 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state + name:cpu_clk_thread_unhalted type:exclusive default:ref_xclk + 0x1 extra: ref_xclk Reference cycles when the thread is unhalted (counts at 100 MHz rate) + 0x2 extra: one_thread_active Count XClk pulses when this thread is unhalted and the other thread is halted. +@@ -119,8 +115,8 @@ name:rs_events type:exclusive default:empty_cycles + 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. 
+ name:offcore_requests_outstanding type:exclusive default:demand_data_rd + 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. +- 0x2 extra:cmask=1 demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. +- 0x4 extra:cmask=1 demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x2 extra: demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore. ++ 0x4 extra: demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle + 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore + 0x10 extra: l3_miss_demand_data_rd Counts number of Offcore outstanding Demand Data Read requests who miss L3 cache in the superQ every cycle. + 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore +@@ -217,6 +213,10 @@ name:uops_executed type:exclusive default:thread + name:tlb_flush type:exclusive default:0x1 + 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries + 0x20 extra: stlb_any STLB flush attempts ++name:inst_retired type:exclusive default:any ++ 0x1 extra: any Instructions retired from execution.mem ++ 0x0 extra: any_p Number of instructions retired. General Counter - architectural event ++ 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution + name:uops_retired type:exclusive default:retire_slots + 0x2 extra: retire_slots Retirement slots used. + 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. +@@ -231,6 +231,7 @@ name:br_inst_retired type:exclusive default:all_branches + 0x1 extra:pebs conditional_pebs Conditional branch instructions retired. 
+ 0x2 extra: near_call Direct and indirect near call instructions retired. + 0x2 extra:pebs near_call_pebs Direct and indirect near call instructions retired. ++ 0x0 extra:pebs all_branches_pebs All (macro) branch instructions retired. + 0x8 extra: near_return Return instructions retired. + 0x8 extra:pebs near_return_pebs Return instructions retired. + 0x10 extra: not_taken Not taken branch instructions retired. +diff --git a/libop/op_events.c b/libop/op_events.c +index f58d243..25f010e 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1200,7 +1200,6 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_NEHALEM: + case CPU_HASWELL: + case CPU_BROADWELL: +- case CPU_SKYLAKE: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +@@ -1213,6 +1212,10 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + descr->name = "CPU_CLK_UNHALTED"; + break; + ++ case CPU_SKYLAKE: ++ descr->name = "cpu_clk_unhalted"; ++ break; ++ + case CPU_RTC: + descr->name = "RTC_INTERRUPTS"; + descr->count = 1024; +-- +2.4.3 + +From cfb3ddbaae4ca2e073b5229bf6019da766eb8da9 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Tue, 7 Jul 2015 11:02:38 -0700 +Subject: [PATCH 3/3] oprofile: Fix unit masks of fixed counters on Skylake + +Fix another issue noticed by William Cohen. + +The unit masks for the fixed counters were incorrect. + +Note that the fixed counters exist in two copies, as perf aliases +them to the generic counter version codes. 
+--- + events/i386/skylake/unit_masks | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks +index b505769..6e81a63 100644 +--- a/events/i386/skylake/unit_masks ++++ b/events/i386/skylake/unit_masks +@@ -76,8 +76,8 @@ name:longest_lat_cache type:exclusive default:0x41 + 0x41 extra: miss Core-originated cacheable demand requests missed L3 + 0x4f extra: reference Core-originated cacheable demand requests that refer to L3 + name:cpu_clk_unhalted type:exclusive default:thread +- 0x2 extra: thread Core cycles when the thread is not in halt state +- 0x3 extra: ref_tsc Reference cycles when the core is not in halt state. ++ 0x0 extra: thread Core cycles when the thread is not in halt state ++ 0x1 extra: ref_tsc Reference cycles when the core is not in halt state. + 0x0 extra: thread_p Thread cycles when thread is not in halt state + 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state + 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state +@@ -214,7 +214,7 @@ name:tlb_flush type:exclusive default:0x1 + 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries + 0x20 extra: stlb_any STLB flush attempts + name:inst_retired type:exclusive default:any +- 0x1 extra: any Instructions retired from execution.mem ++ 0x0 extra: any Instructions retired from execution.mem + 0x0 extra: any_p Number of instructions retired. General Counter - architectural event + 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution + name:uops_retired type:exclusive default:retire_slots +-- +2.4.3 + +commit 635d1f59ff198a43deb9482cdec10795222e506a +Author: Andi Kleen +Date: Fri Apr 15 13:14:51 2016 -0700 + + Add model number of Skylake server to oprofile + + Just reuse the event list of Skylake client. 
+ + Signed-off-by: Andi Kleen + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 994fec4..a6180f4 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -156,6 +156,7 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + return CPU_BROADWELL; + case 0x4e: + case 0x5e: ++ case 0x55: + return CPU_SKYLAKE; + case 0x37: + case 0x4d: +commit 402cad1b6f5605ed854eb8b7b7376cafce3fb007 +Author: Andi Kleen +Date: Fri Apr 29 17:50:25 2016 -0700 + + oprofile: Add model numbers for Kabylake CPUs + + The PMU is using the same events as Skylake, so no other changes. + + Signed-off-by: Andi Kleen + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index f4db8f5..2061760 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -157,6 +157,8 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4e: + case 0x5e: + case 0x55: ++ case 0x8e: ++ case 0x9e: + return CPU_SKYLAKE; + case 0x37: + case 0x4d: diff --git a/SOURCES/oprofile-xml.patch b/SOURCES/oprofile-xml.patch new file mode 100644 index 0000000..abddd48 --- /dev/null +++ b/SOURCES/oprofile-xml.patch @@ -0,0 +1,236 @@ +diff -up oprofile-0.9.9/doc/ophelp.xsd.ophelp oprofile-0.9.9/doc/ophelp.xsd +--- oprofile-0.9.9/doc/ophelp.xsd.ophelp 2014-05-28 10:09:46.279270117 -0400 ++++ oprofile-0.9.9/doc/ophelp.xsd 2014-05-28 10:08:59.416060557 -0400 +@@ -0,0 +1,57 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +commit a339a069f4ceba748df44d2babd9f08ce06abd78 +Author: Maynard Johnson +Date: Thu Nov 7 08:24:05 2013 -0600 + + ophelp schema is not included in installed files + + A one-line change in doc/Makefile.am was needed in order for + 'make install' to put ophelp.xsd in /share/doc/oprofile. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/doc/Makefile.am b/doc/Makefile.am +index 45fbe92..258842f 100644 +--- a/doc/Makefile.am ++++ b/doc/Makefile.am +@@ -26,7 +26,7 @@ man_MANS += operf.1 \ + endif + + htmldir = $(prefix)/share/doc/oprofile +-dist_html_DATA = oprofile.html internals.html opreport.xsd op-jit-devel.html ++dist_html_DATA = oprofile.html internals.html opreport.xsd ophelp.xsd op-jit-devel.html + + if have_xsltproc + + +commit ed40d8d444a17e7cf16a4653607b04a24f4c0513 +Author: William Cohen +Date: Tue Jan 28 11:05:46 2014 -0600 + + Print unit mask name where applicable in ophelp XML output + + Some Intel architectures have named unit masks and it would be useful + to include the unit mask name in the XML output. This patch also + updates the ophelp.xsd schema file to include the optional unit + mask 'name' field. + + Signed-off-by: William Cohen + +diff --git a/doc/ophelp.xsd b/doc/ophelp.xsd +index 9bd7f82..c07bdb4 100644 +--- a/doc/ophelp.xsd ++++ b/doc/ophelp.xsd +@@ -49,6 +49,7 @@ + + + ++ + + + +diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c +index 3b1af21..de107c2 100644 +--- a/libop/op_xml_events.c ++++ b/libop/op_xml_events.c +@@ -95,6 +95,10 @@ void xml_help_for_event(struct op_event const * event) + close_xml_element(NONE, 1, buffer, MAX_BUFFER); + for (i = 0; i < event->unit->num; i++) { + open_xml_element(HELP_UNIT_MASK, 1, buffer, MAX_BUFFER); ++ if (event->unit->um[i].name) ++ init_xml_str_attr(HELP_UNIT_MASK_NAME, ++ event->unit->um[i].name, ++ buffer, MAX_BUFFER); + init_xml_int_attr(HELP_UNIT_MASK_VALUE, + event->unit->um[i].value, + buffer, MAX_BUFFER); +diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c +index 0b3deea..ac3c97b 100644 +--- a/libop/op_xml_out.c ++++ b/libop/op_xml_out.c +@@ -84,7 +84,8 @@ char const * xml_tag_map[] = { + "unit_mask", + "mask", + "desc", +- "extra" ++ "extra", ++ "name" + }; + + #define MAX_BUF_LEN 2048 +diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h +index 
544bd51..6d5a468 100644 +--- a/libop/op_xml_out.h ++++ b/libop/op_xml_out.h +@@ -59,6 +59,7 @@ typedef enum { + HELP_UNIT_MASK_VALUE, + HELP_UNIT_MASK_DESC, + HELP_UNIT_EXTRA_VALUE, ++ HELP_UNIT_MASK_NAME, + } tag_t; + + char const * xml_tag_name(tag_t tag); +commit fd05dade355b482ee9286b7bf90b4b150f49f81c +Author: Maynard Johnson +Date: Mon Feb 3 08:47:30 2014 -0600 + + Remove 'extra' attribute from ophelp XML output; bump schema version + + As discussed on the oprofile mailing list on Sep 24, 2013, there is + no value add in keeping the 'extra' attribute in ophelp's XML output. + The previous commit added the 'name' field to the XML output, and + that is actual valuable information that consumers of the XML output + should use when coding event specifications to pass to operf or + ocount. + + This patch removes the 'extra' attribute and also bumps the schema + version (both in the ophelp.xsd and the XML instance documents). + The schema bump is needed mostly due to removing the 'extra' attribute; + but another reason for it is to draw attention to the new 'name' + attribute, which consumers really must use (when present) in order + to be sure they can properly specify the unitmask that the user + requests. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/doc/ophelp.xsd b/doc/ophelp.xsd +index c07bdb4..1270121 100644 +--- a/doc/ophelp.xsd ++++ b/doc/ophelp.xsd +@@ -11,7 +11,7 @@ + + + +- ++ + + + +diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c +index de107c2..c301732 100644 +--- a/libop/op_xml_events.c ++++ b/libop/op_xml_events.c +@@ -21,7 +21,7 @@ static char buffer[MAX_BUFFER]; + + void open_xml_events(char const * title, char const * doc, op_cpu the_cpu_type) + { +- char const * schema_version = "1.1"; ++ char const * schema_version = "2.0"; + + buffer[0] = '\0'; + cpu_type = the_cpu_type; +@@ -105,10 +105,6 @@ void xml_help_for_event(struct op_event const * event) + init_xml_str_attr(HELP_UNIT_MASK_DESC, + event->unit->um[i].desc, + buffer, MAX_BUFFER); +- if (event->unit->um[i].extra) +- init_xml_int_attr(HELP_UNIT_EXTRA_VALUE, +- event->unit->um[i].extra, +- buffer, MAX_BUFFER); + close_xml_element(NONE, 0, buffer, MAX_BUFFER); + } + close_xml_element(HELP_UNIT_MASKS, 0, buffer, MAX_BUFFER); +diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c +index ac3c97b..63ee41c 100644 +--- a/libop/op_xml_out.c ++++ b/libop/op_xml_out.c +@@ -84,7 +84,6 @@ char const * xml_tag_map[] = { + "unit_mask", + "mask", + "desc", +- "extra", + "name" + }; + +diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h +index 6d5a468..a829f66 100644 +--- a/libop/op_xml_out.h ++++ b/libop/op_xml_out.h +@@ -58,7 +58,6 @@ typedef enum { + HELP_UNIT_MASK, + HELP_UNIT_MASK_VALUE, + HELP_UNIT_MASK_DESC, +- HELP_UNIT_EXTRA_VALUE, + HELP_UNIT_MASK_NAME, + } tag_t; + diff --git a/SOURCES/oprofile-xml2.patch b/SOURCES/oprofile-xml2.patch new file mode 100644 index 0000000..a1a7c17 --- /dev/null +++ b/SOURCES/oprofile-xml2.patch @@ -0,0 +1,134 @@ +commit 5646afee4c74a6759fc61d11b9203b0f6d60f529 +Author: Maynard Johnson +Date: Thu May 29 10:10:41 2014 -0500 + + opreport XML: binary-level count field issues + + See oprofile bug # 236 (https://sourceforge.net/p/oprofile/bugs/236/). 
+ + There are several issues relating to the use of the 'count' element + defined in opreport.xsd. For example, below is the current schema + definition for the 'binary' element. Note the usage of the 'count' + element: + + + + + + + + + + + + + + There have been questions from users whether the 'count' element + associated with the 'binary' element is supposed to represent a + total count across all modules for the executable or if it is only + the count for the executable itself (the answer is the latter). + + Additionally, it's possible that there may be no samples at all + for the binary file -- i.e., all samples collected were for module + elements -- thus, the minOccurs attribute for the 'count' element + of 'binary' should be '0'. + + Finally, using xmllint on a XML instance document created from + opreport on a profile run that specified "--separate-cpu" identified + that the instance document was invalid when compared against its + associated schema file (opreport.xsd). Reviewing the schema, I + realized that all usages of the 'count' element were wrong insofar + as the maxOccurs attribute. Instead of being set to '1', maxOccurs + should be 'unbounded' since we can have multiple 'count' elements + associated with any given higher level element (e.g., 'binary') + if there are multiple classes in the profile. Multiple classes + will exist for a profile for various reasons -- e.g., profiling with + '--separate-cpu', or multiple events. + + This patch addresses these issues. The major version number of the + schema is not being changed -- only the minor number. This is because + instance documents that previously validated using the old schema + will still be valid with the new schema. + + A testsuite patch is being developed to validate XML instance documents + for various scenarios. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/doc/opreport.xsd b/doc/opreport.xsd +index 682a0bf..28e3128 100644 +--- a/doc/opreport.xsd ++++ b/doc/opreport.xsd +@@ -110,7 +110,7 @@ + + + +- ++ + + + +@@ -121,7 +121,7 @@ + + + +- ++ + + + +@@ -131,10 +131,13 @@ + + + +- ++ ++ + +- ++ + + + +@@ -144,7 +147,7 @@ + + + +- ++ + + + +@@ -203,7 +206,7 @@ + + + +- ++ + + + +diff --git a/libpp/xml_utils.cpp b/libpp/xml_utils.cpp +index 942b236..5f1a3a1 100644 +--- a/libpp/xml_utils.cpp ++++ b/libpp/xml_utils.cpp +@@ -245,11 +245,11 @@ void xml_utils::add_option(tag_t tag, bool value) + void xml_utils::output_xml_header(string const & command_options, + string const & cpu_info, string const & events) + { +- // the integer portion indicates the schema version and should change ++ // The integer portion indicates the schema version and should change + // both here and in the schema file when major changes are made to +- // the schema. changes to opreport, or minor changes to the schema ++ // the schema. Changes to opreport, or minor changes to the schema + // can be indicated by changes to the fraction part. +- string const schema_version = "3.0"; ++ string const schema_version = "3.1"; + + // This is the XML version, not schema version. 
+ string const xml_header = ""; diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec new file mode 100644 index 0000000..c60786a --- /dev/null +++ b/SPECS/oprofile.spec @@ -0,0 +1,853 @@ +Summary: System wide profiler +Name: oprofile +Version: 0.9.9 +Release: 23%{?dist} +License: GPLv2+ and LGPLv2+ +Group: Development/System +# +Source0: http://downloads.sourceforge.net/%{name}/%{name}-%{version}.tar.gz +Requires: binutils +Requires: which +Requires(pre): shadow-utils +Requires(postun): shadow-utils +Patch10: oprofile-0.4-guess2.patch +Patch83: oprofile-0.9.7-xen.patch +Patch303: oprofile-num_symbolic.patch +Patch304: oprofile-xml.patch +Patch305: oprofile-rhbz1121205.patch +Patch400: oprofile-haswell.patch +Patch401: oprofile-silvermont.patch +Patch402: oprofile-broadwell.patch +Patch403: oprofile-intelcpuid.patch +Patch500: oprofile-aarch64.patch +Patch600: oprofile-power8.patch +Patch601: oprofile-ppc64le.patch +Patch602: oprofile-ppc64-equivalent.patch +Patch700: oprofile-hugepage.patch +Patch800: oprofile-defaultmask.patch +Patch801: oprofile-extramask.patch +Patch802: oprofile-maskarray.patch +Patch803: oprofile-env.patch +Patch804: oprofile-coverity.patch +Patch900: oprofile-ppc64jvm.patch +Patch1000: oprofile-skylake.patch +Patch1001: oprofile-remap.patch +Patch1002: oprofile-xml2.patch +Patch1003: oprofile-goldmont.patch +Patch1004: oprofile-bz1335145.patch +Patch1005: oprofile-bz1264443.patch +Patch1006: oprofile-captest.patch +Patch1007: oprofile-order.patch +Patch1010: oprofile-rhbz1385007.patch +Patch1011: oprofile-rhbz1426426.patch +Patch2000: oprofile-power9.patch + +URL: http://oprofile.sf.net + +#If oprofile doesn't build on an arch, report it and will add ExcludeArch tag. 
+BuildRequires: qt-devel +BuildRequires: libxslt +BuildRequires: docbook-style-xsl +BuildRequires: docbook-utils +BuildRequires: elinks +BuildRequires: gtk2-devel +BuildRequires: automake +BuildRequires: libtool +BuildRequires: binutils-static +BuildRequires: popt-devel +BuildRequires: java-devel +BuildRequires: jpackage-utils +BuildRequires: java-1.7.0-openjdk-devel +BuildRequires: libpfm-devel >= 4.3.0 + +BuildRoot: %{_tmppath}/%{name}-root + +%description +OProfile is a profiling system for systems running Linux. The +profiling runs transparently in the background, and profile data +can be collected at any time. OProfile makes use of the hardware performance +counters provided on Intel P6 and AMD Athlon family processors, and can use +the RTC for profiling on other x86 processor types. + +See the HTML documentation for further details. + +%package devel +Summary: Header files and libraries for developing apps which will use oprofile +Group: Development/Libraries +Requires: oprofile = %{version}-%{release} +Provides: oprofile-static = %{version}-%{release} + +%description devel + +Header files and libraries for developing apps which will use oprofile. + +%package gui +Summary: GUI for oprofile +Group: Development/System +Requires: oprofile = %{version}-%{release} + +%description gui + +The oprof_start GUI for oprofile. + +%package jit +Summary: Libraries required for profiling Java and other JITed code +Group: Development/System +Requires: oprofile = %{version}-%{release} +#Requires: java >= 1.6 +#Requires: jpackage-utils + +%description jit +This package includes a base JIT support library, as well as a Java +agent library. 
+ +%prep +%setup -q -n %{name}-%{version} +%patch10 -p1 -b .guess2 +%patch83 -p1 -b .xen +%patch303 -p1 -b .num_symbolic +%patch304 -p1 -b .xml +%patch305 -p1 -b .xml +%patch400 -p1 -b .haswell +%patch401 -p1 -b .silvermont +%patch402 -p1 -b .broadwell +%patch403 -p1 +%patch500 -p1 -b .aarch64 +%patch600 -p1 -b .power8 +%patch601 -p1 -b .ppc64le +%patch602 -p1 +%patch700 -p1 +%patch800 -p1 +%patch801 -p1 +%patch802 -p1 +%patch803 -p1 +%patch804 -p1 +%patch900 -p1 +%patch1000 -p1 +%patch1001 -p1 +%patch1002 -p1 +%patch1003 -p1 +%patch1004 -p1 +%patch1005 -p1 -b .archive +%patch1006 -p1 -b .captest +%patch1007 -p1 -b .order +%patch1010 -p1 -b .rhbz1385007 +%patch1011 -p1 -b .rhbz1426426 +%patch2000 -p1 -b .power9 + +./autogen.sh + +%build + +#The CXXFLAGS below is temporary to work around +# bugzilla #113909 +CXXFLAGS=-g; export CXXFLAGS + +%configure \ +--enable-gui=qt4 \ +--with-java=/usr/lib/jvm/java + +make CFLAGS="%{optflags}" + +%install +rm -rf %{buildroot} + +mkdir -p %{buildroot}%{_bindir} +mkdir -p %{buildroot}%{_mandir}/man1 + +make DESTDIR=%{buildroot} INSTALL="install -p" install + +# We want the manuals in the special doc dir, not the generic doc install dir. +# We build it in place and then move it away so it doesn't get installed +# twice. rpm can specify itself where the (versioned) docs go with the +# %%doc directive. 
+mkdir docs.installed +mv %{buildroot}%{_datadir}/doc/oprofile/* docs.installed/ + +mkdir -p %{buildroot}/etc/ld.so.conf.d +echo "%{_libdir}/oprofile" > %{buildroot}/etc/ld.so.conf.d/oprofile-%{_arch}.conf + +%pre +getent group oprofile >/dev/null || groupadd -r -g 16 oprofile +getent passwd oprofile >/dev/null || \ +useradd -g oprofile -d /var/lib/oprofile -M -r -u 16 -s /sbin/nologin \ + -c "Special user account to be used by OProfile" oprofile +exit 0 + +%postun +# do not try to remove existing oprofile user or group + +%files +%defattr(-,root,root) +%doc docs.installed/* +%doc COPYING + +%{_bindir}/ocount +%{_bindir}/ophelp +%{_bindir}/opimport +%{_bindir}/opannotate +%{_bindir}/opcontrol +%{_bindir}/opgprof +%{_bindir}/opreport +%{_bindir}/oprofiled +%{_bindir}/oparchive +%{_bindir}/opjitconv +%{_bindir}/op-check-perfevents +%{_bindir}/operf + +%{_mandir}/man1/* + +%{_datadir}/oprofile + +%files devel +%defattr(-,root,root) + +%{_includedir}/opagent.h + +%files gui +%defattr(-,root,root) + +%{_bindir}/oprof_start + +%post jit -p /sbin/ldconfig + +%postun jit -p /sbin/ldconfig + +%files jit +%defattr(-,root,root) + +%{_libdir}/oprofile +%{_sysconfdir}/ld.so.conf.d/* + +%changelog +* Thu Jun 22 2017 William Cohen - 0.9.9-23 +- Add power9 support. + +* Tue Mar 21 2017 William Cohen - 0.9.9-22 +- Update ppc64/ppc64le support. rhbz1385007 +- Add recognition check for POWER8NV and POWER8NVL. rhbz1426426 + +* Wed Oct 19 2016 William Cohen - 0.9.9-21 +- Fix Intel Goldmont default event + +* Tue Aug 9 2016 William Cohen - 0.9.9-20 +- Ensure that the perf events setup before ocount execs child. + +* Mon Aug 8 2016 William Cohen - 0.9.9-19 +- Allow operation /proc/sys/kernel/perf_event_paranoid == 2. + +* Wed Jul 6 2016 William Cohen - 0.9.9-18 +- Store profiling data with oparchive. 
+ +* Thu May 12 2016 William Cohen - 0.9.9-17 +- Define some Intel broadwell default unit masks by names +- Add support for Harrisonville (Denverton SoC) +- Add support for Skylake-SP server +- Add support for Kabylake-U/Y +- Add support for Kabylake-H/S +- Make Nehalem, Westmere, and Haswell event names unique. + +* Tue Aug 25 2015 William Cohen - 0.9.9-16 +- Improved handling of remapped anonymous regions +- Correct XML generation. + +* Wed Jul 8 2015 William Cohen - 0.9.9-15 +- Add support for Intel skylake processors. + +* Fri Jun 26 2015 William Cohen - 0.9.9-14 +- Recognize Intel Broadwell-DE. + +* Fri Jun 5 2015 William Cohen - 0.9.9-13 +- Further fix to allow operf to record information for Java anon_huges. + +* Fri Jun 5 2015 William Cohen - 0.9.9-12 +- Eliminate some coverity warnings. + +* Tue Apr 7 2015 William Cohen - 0.9.9-11 +- Avoid setting POSIXLY_CORRECT for the children tasks of operf and ocount. +- Fix handling of default unit masks longer than 11 char. +- Fix extra and default unitmasks selection. +- Allow operf to record information for Java anon_huges. + +* Wed Oct 1 2014 Will Cohen - 0.9.9-7 +- Correct identification power8le. rhbz1148525 + +* Wed Sep 17 2014 Will Cohen - 0.9.9-6 +- Update support for Intel Silvermont (Avoton). +- Enable configure for ppc64le. + +* Mon Aug 18 2014 Will Cohen - 0.9.9-5 +- Update Intel Haswell events. +- Add support for Intel Silvermont (Avoton). +- Add support for Intel Broadwell. +- Add support for aarch64. +- Update IBM power8 events. + +* Fri Jan 24 2014 Daniel Mach - 0.9.9-4 +- Mass rebuild 2014-01-24 + +* Fri Dec 27 2013 Daniel Mach - 0.9.9-3 +- Mass rebuild 2013-12-27 + +* Tue Aug 06 2013 Will Cohen - 0.9.9-2 +- rhbz993994 Eliminate versioned doc pages. + +* Mon Jul 29 2013 Will Cohen - 0.9.9-1 +- Rebase on oprofile. 
+- Trim changelog entries + +* Mon Jul 15 2013 Will Cohen - 0.9.8-10 +- rhbz949028: Man page scan results for oprofile + +* Thu Feb 14 2013 Fedora Release Engineering - 0.9.8-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_19_Mass_Rebuild + +* Wed Dec 19 2012 Will Cohen - 0.9.8-3 +- Use buildid support instead of crc checks. rhbz #877187 + +* Mon Oct 15 2012 Will Cohen - 0.9.8-2 +- Cleanup configure. +- Add libpfm-devel to the buildrequires. + +* Tue Sep 04 2012 Will Cohen - 0.9.8-1 +- Rebase on oprofile-0.9.8. + +* Fri Jul 20 2012 Fedora Release Engineering - 0.9.7-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild + +* Thu Apr 5 2012 Will Cohen - 0.9.7-4 +- Fix autogen.sh to avoid false match. + +* Wed Apr 4 2012 Will Cohen - 0.9.7-3 +- Use correct macros for /etc and /usr/share. rhbz #226222 +- Consistently use macros for buildroot. +- Preserve timestamp for installed files. +- Remove the clean section. +- Fix the source location. +- Remove unneeded BuildRequires: binutils-devel +- Remove unneeded depends. +- Correct Buildreq to java-1.7.0-openjdk-devel. +- Fix macro-in-comment and macro-in-changelog +- Remove '.' from Summary lines +- Correct license GPLv2+ and LGPLv2+. +- Do not remove oprofile user or group. + +* Tue Jan 10 2012 Will Cohen - 0.9.7-2 +- Remove duplicate -r option in %%pre useradd Resolves: rhbz #772841 + +* Tue Nov 29 2011 Will Cohen - 0.9.7-1 +- Rebase on oprofile-0.9.7. + +* Tue Jun 07 2011 Will Cohen - 0.9.6-21 +- Correct CVE-2011-1760. Resolves: rhbz #701508 + +* Tue Apr 5 2011 Will Cohen - 0.9.6-20 +- Re-enable xenoprof patch. + +* Thu Mar 31 2011 Will Cohen - 0.9.6-19 +- Provide oprofile-static. + +* Tue Mar 15 2011 Will Cohen - 0.9.6-18 +- Clean up rpmlint complaints. + +* Tue Mar 15 2011 Will Cohen - 0.9.6-17 +- Correct oprofile user information. + +* Thu Mar 10 2011 Will Cohen - 0.9.6-16 +- Remove obsolete configure options. + +* Thu Mar 10 2011 Will Cohen - 0.9.6-15 +- Use QT4. 
+ +* Fri Feb 25 2011 Will Cohen - 0.9.6-14 +- Add processor models for Intel westmere and core i7. + +* Wed Feb 09 2011 Will Cohen - 0.9.6-12 +- Eliminate illegal mutable use. + +* Tue Feb 08 2011 Fedora Release Engineering - 0.9.6-11 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_15_Mass_Rebuild + +* Thu Jan 6 2011 Will Cohen - 0.9.6-10 +- Corrections for i386/arch_perfmon filters. +- Make nehalem events available. +- Add AMD family 12/14/15h support. +- Add Intel westmere support. +- opcontrol numeric argument checking. + +* Wed Apr 21 2010 Will Cohen - 0.9.6-6 +- Bump version and rebuild. + +* Wed Apr 14 2010 Will Cohen - 0.9.6-5 +- Handle debuginfo section differences. rhbz554639 + +* Mon Apr 5 2010 Will Cohen - 0.9.6-3 +- Include Buildrequires for binutils-static. + +* Fri Dec 11 2009 Will Cohen - 0.9.6-2 +- Clean up oprofile.spec file. + +* Tue Nov 24 2009 Will Cohen - 0.9.6-1 +- Rebase on OProfile 0.9.6. + +* Wed Oct 21 2009 Will Cohen - 0.9.5-4 +- Switch to using ExcludeArch. + +* Wed Oct 7 2009 Will Cohen - 0.9.5-3 +- Allow timer mode to work. +- Correct location for additional files in man pages. Resolves: rhbz #508669 + +* Fri Sep 4 2009 Will Cohen - 0.9.5-2 +- Bump version and rebuild. + +* Mon Aug 3 2009 Will Cohen - 0.9.5-1 +- Rebase on OProfile 0.9.5. + +* Sat Jul 25 2009 Fedora Release Engineering - 0.9.4-13 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_12_Mass_Rebuild + +* Thu Jul 16 2009 Will Cohen - 0.9.4-12 +- Add shadow-utils to requires. Resolves: rhbz #501357 +- Add LGPL license to provided java support. Resolves: rhbz #474666 +- Correct handling of --verbose. Resolves: rhbz #454969 + +* Mon May 11 2009 Will Cohen - 0.9.4-9 +- Assign specific UID and GID to oprofile. + +* Thu Apr 23 2009 Will Cohen - 0.9.4-7 +- Backport Intel Architecture Perfmon support. Resolves: rhbz #497230 + +* Wed Apr 8 2009 Will Cohen - 0.9.4-6 +- Test for basename declaration. + +* Wed Apr 8 2009 Will Cohen - 0.9.4-5 +- Bump version and rebuild. 
+ +* Thu Feb 26 2009 Fedora Release Engineering - 0.9.4-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + +* Mon Sep 29 2008 Dennis Gilmore - 0.9.4-3 +- build sparcv9 not sparc + +* Mon Jul 21 2008 Will Cohen - 0.9.4-2 +- Correct oprofile.spec. + +* Fri Jul 18 2008 Will Cohen - 0.9.4-1 +- Update to oprofile 0.9.4. + +* Mon Jun 23 2008 Will Cohen - 0.9.3-18 +- Fix default location for vmlinux. rhbz #451539 + +* Fri Apr 04 2008 Will Cohen - 0.9.3-17 +- Use older qt3-devel. rhbz #440949 + +* Fri Feb 15 2008 Will Cohen - 0.9.3-16 +- Corrections for compilation with gcc-4.3. + +* Fri Jan 18 2008 Will Cohen - 0.9.3-15 +- Deal with xenoprof conflicts with cell. Resolves: rhbz #250852 + +* Fri Jan 18 2008 Will Cohen - 0.9.3-14 +- Bump format version. Check version properly. Resolves: rhbz #394571 + +* Fri Jan 18 2008 Will Cohen - 0.9.3-13 +- Disable profiling in hypervisor on 970MP to prevent lost interrupts. + Resolves: rhbz #391251 + +* Fri Jan 18 2008 Will Cohen - 0.9.3-12 +- Use more inclusive set of kernel ranges. Resolves: rhbz #307111 + +* Fri Jan 18 2008 Will Cohen - 0.9.3-11 +- Update AMD family 10h events to match AMD documentation Resolves: rhbz #232956 + +* Mon Nov 12 2007 Will Cohen - 0.9.3-7 +- Should correct missing 'test' in patch. + +* Mon Oct 8 2007 Will Cohen - 0.9.3-5 +- Should be popt-devel to BuildRequires. + +* Mon Oct 8 2007 Will Cohen - 0.9.3-5 +- Add popt to BuildRequires. + +* Mon Oct 8 2007 Will Cohen - 0.9.3-4 +- Allow short forms of --list-events (-l) and --dump (-d). + Resolves: rhbz#234003. + +* Tue Aug 21 2007 Will Cohen - 0.9.3-3 +- rebuild + +* Wed Jul 25 2007 Will Cohen - 0.9.3-2 +- Re-enable xen patch. + +* Tue Jul 17 2007 Will Cohen - 0.9.3-1 +- Rebase on 0.9.3 release. +- Disable xen patch until fixed. + +* Mon May 21 2007 Will Cohen - 0.9.2-9 +- Fix up rpmlint complaints. + +* Wed Mar 21 2007 Will Cohen - 0.9.2-8 +- Add AMD family 10 support. Resolves: rhbz#232956. 
+ +* Wed Mar 21 2007 Will Cohen - 0.9.2-7 +- Correct description for package. +- Correct backtrace documentation. Resolves: rhbz#214793. +- Correct race condition. Resolves: rhbz#220116. + + +* Fri Nov 3 2006 Will Cohen - 0.9.2-3 +- Add dist tag to build. + +* Fri Sep 22 2006 Will Cohen - 0.9.2-2 +- Rebase on 0.9.2 release. + +* Thu Aug 24 2006 Will Cohen +- Update xenoprof patch. + +* Wed Jul 19 2006 Jesse Keating - 0.9.1-15 +- rebuild +- remove silly release definition + +* Wed Jul 12 2006 Will Cohen +- Support for Intel Woodcrest. (#183081) + +* Wed Jul 12 2006 Jesse Keating - 0.9.1-13.1.1.1 +- rebuild + +* Mon Jul 10 2006 Will Cohen +- Add power6 support. (#196505) + +* Fri Jul 7 2006 Will Cohen +- Support for power5+. (#197728) +- Fix PPC64 events and groups. (#197895) + +* Wed Jun 07 2006 Will Cohen +- Put oprof_start in to oprofile-gui. + +* Wed Jun 07 2006 Will Cohen - 0.9.1-10.1.1 +- Bump version and rebuild. + +* Sat May 13 2006 Will Cohen - 0.9.1-9.1.1 +- Add xenoprof patch. + +* Fri Feb 10 2006 Jesse Keating - 0.9.1-8.1.1 +- bump again for double-long bug on ppc(64) + +* Fri Feb 10 2006 Will Cohen +- Complete path for which and dirname in opcontrol. + +* Tue Feb 07 2006 Jesse Keating - 0.9.1-7.1 +- rebuilt for new gcc4.1 snapshot and glibc changes + +* Thu Dec 22 2005 Jesse Keating +- rebuilt + +* Mon Dec 05 2005 Will Cohen +- Correct anon namespace issue. + +* Fri Nov 11 2005 Will Cohen +- Add alpha and sparcs to exclusivearch. + +* Tue Jul 26 2005 Will Cohen +- Rebase on OProfile 0.9.1. +- Add MIPS 24K files to manifest. + +* Wed Jun 08 2005 Will Cohen +- Rebase on OProfile 0.9. + +* Wed Apr 13 2005 Will Cohen +- Add which dependency. + +* Tue Apr 05 2005 Will Cohen +- Backport ppc64 patch for synthesizing dotted symbols. + +* Mon Mar 21 2005 Will Cohen +- Bump release. +- Rebase on 0.8.2 release. + +* Mon Mar 14 2005 Will Cohen +- Bump rebuild with gcc4. 
+ +* Wed Feb 9 2005 Will Cohen +- Do not need -D_FORTIFY_SOURCE=2 + +* Wed Feb 9 2005 Will Cohen +- Rebuild for -D_FORTIFY_SOURCE=2 + +* Fri Oct 15 2004 Will Cohen +- Additional ppc64 support for ppc64/970. + +* Thu Oct 7 2004 Will Cohen +- Correct opcontrol check for Power 4/5. + +* Fri Oct 1 2004 Will Cohen +- Add support for Power 4/5 performance monitoring hardware. + +* Wed Sep 22 2004 Will Cohen +- Add logic to use preferred symbol names. + +* Wed Sep 15 2004 Will Cohen +- Clean up file manifests. + +* Mon Sep 13 2004 Will Cohen +- Rebase on 0.8.1 release. + +* Wed Jul 7 2004 Will Cohen +- Add oparchive patch. + +* Mon Jun 21 2004 Will Cohen +- bump version + +* Tue Jun 15 2004 Elliot Lee +- rebuilt + +* Thu May 20 2004 Will Cohen +- Eliminate AUTOMAKE and ACLOCAL definitions. +- Correct QTDIR and add oprof_start to file manifests. + +* Tue May 11 2004 Will Cohen +- Remove wildcards in the file manifests. +- Correct build directory. +- Use the 0.8 release tarball. + +* Tue Mar 23 2004 Will Cohen +- Bump version and rebuild. + +* Mon Mar 15 2004 Will Cohen +- Correct cvs checkin. + +* Thu Feb 19 2004 Will Cohen +- Use automake 1.6. + +* Wed Jan 21 2004 Will Cohen +- Rebase on 8.0 cvs snapshot. + +* Mon Dec 01 2003 Will Cohen +- Turn on debug info patch. + +* Mon Nov 24 2003 Will Cohen +- Rebase on 7.1 cvs snapshot. + +* Fri Sep 26 2003 Will Cohen +- Reenable separatedebug and filepos patch. + +* Thu Sep 4 2003 Will Cohen +- Limit to i386. +- Everything but x86_64. +- Turn on x86_64. + +* Mon Aug 11 2003 Will Cohen +- Add gtk2-devel to build requirements. + +* Thu Aug 07 2003 Will Cohen +- adapt to 0.7cvs. + +* Wed Jul 30 2003 Will Cohen +- handle sample files names with spaces. +- clean spec file. +- revise opcontrol --reset. + +* Fri Jul 25 2003 Will Cohen +- Restrict PATH in opcontrol. + +* Wed Jul 09 2003 Will Cohen +- Patch for testing code coverage. +- Better handling of 2.5 module information. 
+ +* Fri Jun 27 2003 Will Cohen +- move to oprofile 0.5.4 pristine tarball. + +* Fri Jun 13 2003 Will Cohen +- Bitmask check. + +* Wed Jun 11 2003 Will Cohen +- Update AMD events. + +* Fri Jun 06 2003 Will Cohen +- Build for ppc64. + +* Thu Jun 05 2003 Will Cohen +- put in s390. +- Fix includes for asserts. +- Make sure elinks is available for html to txt conversion. + +* Fri May 23 2003 Will Cohen +- Avoid library name collisions. + +* Thu May 22 2003 Will Cohen +- Turn on ppc build. +- Turn off ppc build. +- Package op_list.h. + +* Mon May 19 2003 Will Cohen +- Correct typo. + +* Thu Apr 24 2003 Will Cohen +- check min event counts. +- revised op_to_source output to avoid changing line count. +- p4event events revised. +- hammer events revised. + +* Wed Apr 23 2003 Will Cohen +- re-enable ppc build. + +* Wed Apr 16 2003 Will Cohen +- Use /proc/ksym for module information. +- Correct separate debuginfo handling. +- Configure with --enable-abi. + +* Tue Apr 1 2003 Will Cohen +- Correct path finding for daemon and op_help. + +* Mon Mar 31 2003 Will Cohen +- Fix name collisions with /usr/lib/libdb.a. + +* Fri Mar 28 2003 Will Cohen +- clean up spec file. +- turn off ppc build. + +* Mon Mar 24 2003 Will Cohen +- getc instead of fgetc to improve performance. + +* Thu Mar 20 2003 Will Cohen +- produce oprofile-devel. + +* Thu Mar 13 2003 Will Cohen +- fix opvisualise patch format. + +* Wed Mar 12 2003 Will Cohen +- add cmoller changes to fix warnings in opvisualise. + +* Tue Mar 11 2003 Will Cohen +- setup to build on ppc. +- turn on op_visualise for ia64. +- remove unused patches. + +* Mon Mar 10 2003 Will Cohen +- re-enable op_visualise. + +* Fri Mar 7 2003 Will Cohen +- move to oprofile 0.5.1 pristine tarball. +- change libdb abi. + +* Fri Feb 14 2003 Will Cohen +- Requires binutils not perl. + +* Thu Feb 13 2003 Will Cohen +- correct x86_64 sys_lookup_dcookie. +- correct applications of patches. + +* Mon Feb 10 2003 Will Cohen +- rebuilt. 
+- handle stale locks +- opcontrol rtc patch +- update manpage info + +* Fri Feb 7 2003 Will Cohen +- turn on build for ppc64 +- change order op_visualise searches lib directories. +- revise oprofile-0.4-deprecate patch. +- utils/oprofile kernel range check, --save, and do_dump corrections. +- update gui to use "--separate=library". + +* Thu Feb 6 2003 Will Cohen +- Fix dumping. + +* Fri Jan 31 2003 Will Cohen +- Syscall value for x86_64. +- Update manpage and documentation. +- Revise utils/* to deprecate old. +- Include CPU_P4_HT2 in op_help.c +- Revise how CPU_TIMER_INT handled. +- Apply cookie patch for all archs. +- Correct autogen.sh location. + +* Mon Jan 27 2003 Will Cohen +- Add Hammer specific events. + +* Fri Jan 24 2003 Will Cohen +- Hack to get correct syscall for ia64. +- Hack to get timer interrupt data. +- Fix doc/Makefile.am. + +* Wed Jan 22 2003 Will Cohen +- Add patch for separate debug information. + +* Wed Jan 22 2003 Tim Powers +- rebuilt + +* Thu Jan 16 2003 Will Cohen +- Add support for P4 HT. + +* Wed Jan 15 2003 Will Cohen +- Add support for x86_64. + +* Tue Jan 07 2003 Will Cohen +- Revise op_visualise patch to check opendir() results. + +* Mon Jan 06 2003 Will Cohen +- Patch to fix op_visualise seg fault on startup. + +* Thu Jan 02 2003 Will Cohen +- Correct argument type in daemon/oprofiled.c. +- Correct QTDIR. + +* Wed Dec 18 2002 Will Cohen +- Correct reporting of interrupts in oprof_start. + +* Wed Dec 18 2002 Will Cohen +- Rebuilt against new kernel + +* Fri Dec 13 2002 Will Cohen +- Use opcontrol in oprof_start. + +* Thu Dec 12 2002 Will Cohen +- Correct opvisualise problem. + +* Tue Dec 10 2002 Will Cohen +- Add opcontrol, op_dump, op_visualise, ia64 support, + and debugging information. + +* Fri Dec 06 2002 Will Cohen +- Change to use OProfile 0.4 release and kernel support. 
+ +* Sat Nov 30 2002 Tim Powers 0.3-0.20021108.1 +- rebuild against current version of libbfd + +* Tue Aug 06 2002 Will Cohen +- Change to avoid assumption on executable name + +* Fri Aug 02 2002 Will Cohen +- Move to 0.4cvs sources. + +* Mon Jul 29 2002 Will Cohen +- localize nr_counter code +- add ia64 arch +- guess path to vmlinux. + +* Sun Jul 28 2002 Will Cohen +- adjust structure to fit ia64 oprofile module. + +* Thu Jul 25 2002 Will Cohen +- recognize ia64 cpu and events. + +* Tue Jul 23 2002 Will Cohen +- changes to turn off warning as error on ia64. + +* Tue Jul 23 2002 Will Cohen +- changes to allow compilation on ia64. + +* Mon Jul 22 2002 Will Cohen +- pick better Red Hat Linux default image file in /boot. + +* Sun Jul 14 2002 Will Cohen +- use older OProfile 0.2 kernel<->daemon API. + +* Thu Jul 11 2002 Will Cohen +- avoid oprof_start installing the oprofile module + +* Tue Jul 02 2002 Will Cohen +- avoid building and installing the oprofile module + +* Tue May 28 2002 Jeff Johnson +- create package.