From 5dda5cc9dfc60f051dbabf9a9d1c28f26c1695d7 Mon Sep 17 00:00:00 2001 From: Honggang Li Date: Jul 16 2021 01:38:30 +0000 Subject: Update to 4.1.1 Enable psm2 support fbtl-posix: link to common_ompio Revert upstream commit c36d7459b6331c4d Resolve: rhbz#1869443 Signed-off-by: Honggang Li --- diff --git a/.gitignore b/.gitignore index 104cfac..5746d2b 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ openmpi-1.4.1-RH.tar.bz2 /openmpi-4.0.4.tar.bz2 /openmpi-4.0.5.tar.bz2 /openmpi-4.1.0.tar.bz2 +/openmpi-4.1.1.tar.bz2 diff --git a/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch b/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch new file mode 100644 index 0000000..2871232 --- /dev/null +++ b/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch @@ -0,0 +1,367 @@ +From 63c80c7692e55f634cbca6f67cc5c9cdef3a04d2 Mon Sep 17 00:00:00 2001 +From: Honggang Li +Date: Mon, 28 Jun 2021 21:38:13 +0800 +Subject: [PATCH] Revert "ucx: check supported transports and devices for + setting priority" + +This reverts commit c36d7459b6331c4da825cad5a64326e7c1a272aa. +--- + contrib/platform/mellanox/optimized.conf | 2 - + ompi/mca/pml/ucx/pml_ucx_component.c | 15 +- + opal/mca/common/ucx/common_ucx.c | 202 +---------------------- + opal/mca/common/ucx/common_ucx.h | 15 -- + opal/mca/common/ucx/configure.m4 | 2 - + 5 files changed, 2 insertions(+), 234 deletions(-) + +diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf +index 543fd8d1e224..b86b37c9e2fa 100644 +--- a/contrib/platform/mellanox/optimized.conf ++++ b/contrib/platform/mellanox/optimized.conf +@@ -61,8 +61,6 @@ + coll = ^ml + hwloc_base_binding_policy = core + btl = self +-pml_ucx_tls = any +-pml_ucx_devices = any + # Basic behavior to smooth startup + mca_base_component_show_load_errors = 0 + orte_abort_timeout = 10 +diff --git a/ompi/mca/pml/ucx/pml_ucx_component.c b/ompi/mca/pml/ucx/pml_ucx_component.c +index 6aed6c41d11d..ed9cc6573e8e 100644 +--- a/ompi/mca/pml/ucx/pml_ucx_component.c ++++ b/ompi/mca/pml/ucx/pml_ucx_component.c +@@ -107,26 +107,13 @@ static mca_pml_base_module_t* + mca_pml_ucx_component_init(int* priority, bool enable_progress_threads, + bool enable_mpi_threads) + { +- opal_common_ucx_support_level_t support_level; + int ret; + +- support_level = opal_common_ucx_support_level(ompi_pml_ucx.ucp_context); +- if (support_level == OPAL_COMMON_UCX_SUPPORT_NONE) { +- return NULL; +- } +- + if ( (ret = mca_pml_ucx_init(enable_mpi_threads)) != 0) { + return NULL; + } + +- /* +- * If found supported devices - set to the configured (high) priority. +- * Otherwise - Found only supported transports (which could be exposed by +- * unsupported devices), so set a priority lower than ob1. +- */ +- *priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ? +- ompi_pml_ucx.priority : 19; +- PML_UCX_VERBOSE(2, "returning priority %d", *priority); ++ *priority = ompi_pml_ucx.priority; + return &ompi_pml_ucx.super; + } + +diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c +index ac7a17d799a5..ae8e66877ab6 100644 +--- a/opal/mca/common/ucx/common_ucx.c ++++ b/opal/mca/common/ucx/common_ucx.c +@@ -14,11 +14,8 @@ + #include "opal/mca/base/mca_base_framework.h" + #include "opal/mca/pmix/pmix.h" + #include "opal/memoryhooks/memory.h" +-#include "opal/util/argv.h" + + #include +-#include +-#include + + /***********************************************************************/ + +@@ -28,8 +25,7 @@ opal_common_ucx_module_t opal_common_ucx = { + .verbose = 0, + .progress_iterations = 100, + .registered = 0, +- .opal_mem_hooks = 0, +- .tls = NULL ++ .opal_mem_hooks = 0 + }; + + static void opal_common_ucx_mem_release_cb(void *buf, size_t length, +@@ -40,15 +36,10 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length, + + OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component) + { +- static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc"; +- static const char *default_devices = "mlx*"; + static int registered = 0; + static int hook_index; + static int verbose_index; + static int progress_index; +- static int tls_index; +- static int devices_index; +- + if (!registered) { + verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose", + "Verbose level of the UCX components", +@@ -69,29 +60,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t * + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, + &opal_common_ucx.opal_mem_hooks); +- +- opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls)); +- *opal_common_ucx.tls = strdup(default_tls); +- tls_index = mca_base_var_register("opal", "opal_common", "ucx", "tls", +- "List of UCX transports which should be supported on the system, to enable " +- "selecting the UCX component. Special values: any (any available). " +- "A '^' prefix negates the list. " +- "For example, in order to exclude on shared memory and TCP transports, " +- "please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.", +- MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, +- OPAL_INFO_LVL_3, +- MCA_BASE_VAR_SCOPE_LOCAL, +- opal_common_ucx.tls); +- +- opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices)); +- *opal_common_ucx.devices = strdup(default_devices); +- devices_index = mca_base_var_register("opal", "opal_common", "ucx", "devices", +- "List of device driver pattern names, which, if supported by UCX, will " +- "bump its priority above ob1. Special values: any (any available)", +- MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, +- OPAL_INFO_LVL_3, +- MCA_BASE_VAR_SCOPE_LOCAL, +- opal_common_ucx.devices); + registered = 1; + } + if (component) { +@@ -107,14 +75,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t * + component->mca_type_name, + component->mca_component_name, + "opal_mem_hooks", 0); +- mca_base_var_register_synonym(tls_index, component->mca_project_name, +- component->mca_type_name, +- component->mca_component_name, +- "tls", 0); +- mca_base_var_register_synonym(devices_index, component->mca_project_name, +- component->mca_type_name, +- component->mca_component_name, +- "devices", 0); + } + } + +@@ -163,166 +123,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void) + opal_output_close(opal_common_ucx.output); + } + +-#if HAVE_DECL_OPEN_MEMSTREAM +-static bool opal_common_ucx_check_device(const char *device_name, char **device_list) +-{ +- char sysfs_driver_link[PATH_MAX]; +- char driver_path[PATH_MAX]; +- char *ib_device_name; +- char *driver_name; +- char **list_item; +- ssize_t ret; +- +- /* mlx5_0:1 */ +- ret = sscanf(device_name, "%m[^:]%*d", &ib_device_name); +- if (ret != 1) { +- return false; +- } +- +- sysfs_driver_link[sizeof(sysfs_driver_link) - 1] = '\0'; +- snprintf(sysfs_driver_link, sizeof(sysfs_driver_link) - 1, +- "/sys/class/infiniband/%s/device/driver", ib_device_name); +- free(ib_device_name); +- +- driver_path[sizeof(driver_path) - 1] = '\0'; +- ret = readlink(sysfs_driver_link, driver_path, sizeof(driver_path) - 1); +- if (ret < 0) { +- MCA_COMMON_UCX_VERBOSE(2, "readlink(%s) failed: %s", sysfs_driver_link, +- strerror(errno)); +- return false; +- } +- +- driver_name = basename(driver_path); +- for (list_item = device_list; *list_item != NULL; ++list_item) { +- if (!fnmatch(*list_item, driver_name, 0)) { +- MCA_COMMON_UCX_VERBOSE(2, "driver '%s' matched by '%s'", +- driver_path, *list_item); +- return true; +- } +- } +- +- return false; +-} +-#endif +- +-OPAL_DECLSPEC opal_common_ucx_support_level_t +-opal_common_ucx_support_level(ucp_context_h context) +-{ +- opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE; +- static const char *support_level_names[] = { +- [OPAL_COMMON_UCX_SUPPORT_NONE] = "none", +- [OPAL_COMMON_UCX_SUPPORT_TRANSPORT] = "transports only", +- [OPAL_COMMON_UCX_SUPPORT_DEVICE] = "transports and devices" +- }; +-#if HAVE_DECL_OPEN_MEMSTREAM +- char *rsc_tl_name, *rsc_device_name; +- char **tl_list, **device_list, **list_item; +- bool is_any_tl, is_any_device; +- bool found_tl, negate; +- char line[128]; +- FILE *stream; +- char *buffer; +- size_t size; +- int ret; +-#endif +- +- is_any_tl = !strcmp(*opal_common_ucx.tls, "any"); +- is_any_device = !strcmp(*opal_common_ucx.devices, "any"); +- +- /* Check for special value "any" */ +- if (is_any_tl && is_any_device) { +- MCA_COMMON_UCX_VERBOSE(1, "ucx is enabled on any transport or device", +- *opal_common_ucx.tls); +- support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE; +- goto out; +- } +- +-#if HAVE_DECL_OPEN_MEMSTREAM +- /* Split transports list */ +- negate = ('^' == (*opal_common_ucx.tls)[0]); +- tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ','); +- if (tl_list == NULL) { +- MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled", +- *opal_common_ucx.tls); +- goto out; +- } +- +- /* Split devices list */ +- device_list = opal_argv_split(*opal_common_ucx.devices, ','); +- if (device_list == NULL) { +- MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled", +- *opal_common_ucx.devices); +- goto out_free_tl_list; +- } +- +- /* Open memory stream to dump UCX information to */ +- stream = open_memstream(&buffer, &size); +- if (stream == NULL) { +- MCA_COMMON_UCX_VERBOSE(1, "failed to open memory stream for ucx info (%s), " +- "ucx is disabled", strerror(errno)); +- goto out_free_device_list; +- } +- +- /* Print ucx transports information to the memory stream */ +- ucp_context_print_info(context, stream); +- +- /* Rewind and read transports/devices list from the stream */ +- fseek(stream, 0, SEEK_SET); +- while ((support_level != OPAL_COMMON_UCX_SUPPORT_DEVICE) && +- (fgets(line, sizeof(line), stream) != NULL)) { +- rsc_tl_name = NULL; +- ret = sscanf(line, +- /* "# resource 6 : md 5 dev 4 flags -- rc_verbs/mlx5_0:1" */ +- "# resource %*d : md %*d dev %*d flags -- %m[^/ \n\r]/%m[^/ \n\r]", +- &rsc_tl_name, &rsc_device_name); +- if (ret != 2) { +- free(rsc_tl_name); +- continue; +- } +- +- /* Check if 'rsc_tl_name' is found provided list */ +- found_tl = is_any_tl; +- for (list_item = tl_list; !found_tl && (*list_item != NULL); ++list_item) { +- found_tl = !strcmp(*list_item, rsc_tl_name); +- } +- +- /* Check if the transport has a match (either positive or negative) */ +- assert(!(is_any_tl && negate)); +- if (found_tl != negate) { +- if (is_any_device || +- opal_common_ucx_check_device(rsc_device_name, device_list)) { +- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched both transport and device list", +- rsc_tl_name, rsc_device_name); +- support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE; +- } else { +- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched transport list but not device list", +- rsc_tl_name, rsc_device_name); +- support_level = OPAL_COMMON_UCX_SUPPORT_TRANSPORT; +- } +- } else { +- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: did not match transport list", +- rsc_tl_name, rsc_device_name); +- } +- +- free(rsc_device_name); +- free(rsc_tl_name); +- } +- +- MCA_COMMON_UCX_VERBOSE(2, "support level is %s", support_level_names[support_level]); +- fclose(stream); +- free(buffer); +- +-out_free_device_list: +- opal_argv_free(device_list); +-out_free_tl_list: +- opal_argv_free(tl_list); +-out: +-#else +- MCA_COMMON_UCX_VERBOSE(2, "open_memstream() was not found, ucx is disabled"); +-#endif +- return support_level; +-} +- + void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status) + { + } +diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h +index 92cdd738ef98..202131ac8907 100644 +--- a/opal/mca/common/ucx/common_ucx.h ++++ b/opal/mca/common/ucx/common_ucx.h +@@ -88,8 +88,6 @@ typedef struct opal_common_ucx_module { + int progress_iterations; + int registered; + bool opal_mem_hooks; +- char **tls; +- char **devices; + } opal_common_ucx_module_t; + + typedef struct opal_common_ucx_del_proc { +@@ -97,23 +95,10 @@ typedef struct opal_common_ucx_del_proc { + size_t vpid; + } opal_common_ucx_del_proc_t; + +-typedef enum { +- /* No supported transports found (according to configured list of supported +- transports) */ +- OPAL_COMMON_UCX_SUPPORT_NONE, +- +- /* Have supported transports but not supported devices */ +- OPAL_COMMON_UCX_SUPPORT_TRANSPORT, +- +- /* Have both supported transports and supported devices */ +- OPAL_COMMON_UCX_SUPPORT_DEVICE, +-} opal_common_ucx_support_level_t; +- + extern opal_common_ucx_module_t opal_common_ucx; + + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); + OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); +-OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_context_h context); + OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void); + OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); + OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker); +diff --git a/opal/mca/common/ucx/configure.m4 b/opal/mca/common/ucx/configure.m4 +index af8628a889c6..27e07c2005b2 100644 +--- a/opal/mca/common/ucx/configure.m4 ++++ b/opal/mca/common/ucx/configure.m4 +@@ -18,8 +18,6 @@ AC_DEFUN([MCA_opal_common_ucx_CONFIG],[ + [common_ucx_happy="yes"], + [common_ucx_happy="no"]) + +- AC_CHECK_DECLS([open_memstream], [], [], [[#include ]]) +- + AS_IF([test "$common_ucx_happy" = "yes"], + [$1], + [$2]) +-- +2.31.1 + diff --git a/266189935aef4fce825d0db831b4b53accc62c32.patch b/266189935aef4fce825d0db831b4b53accc62c32.patch new file mode 100644 index 0000000..ac960e4 --- /dev/null +++ b/266189935aef4fce825d0db831b4b53accc62c32.patch @@ -0,0 +1,33 @@ +From 266189935aef4fce825d0db831b4b53accc62c32 Mon Sep 17 00:00:00 2001 +From: Jeff Squyres +Date: Tue, 22 Jun 2021 22:28:37 -0400 +Subject: [PATCH] fbtl-posix: link to common_ompio + +The posix fbtl calls mca_common_ompio_progress(), which resides in +common/ompio (i.e., libmca_common_ompio.la). So add that into +mca_fbtl_posix_la_LIBADD (like we do in a few other OMPIO-based +components). Failure to do this *can* lead to the posix fbtl +component failing to load (depending on whether other OMPIO-based +components that pull in libmca_common_ompio were loaded first). + +Thanks to Honggang Li for raising the issue. + +Signed-off-by: Jeff Squyres +--- + ompi/mca/fbtl/posix/Makefile.am | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/ompi/mca/fbtl/posix/Makefile.am b/ompi/mca/fbtl/posix/Makefile.am +index a7b0624d3ec..1ce19cb09b7 100644 +--- a/ompi/mca/fbtl/posix/Makefile.am ++++ b/ompi/mca/fbtl/posix/Makefile.am +@@ -34,7 +34,8 @@ mcacomponentdir = $(ompilibdir) + mcacomponent_LTLIBRARIES = $(component_install) + mca_fbtl_posix_la_SOURCES = $(sources) + mca_fbtl_posix_la_LDFLAGS = -module -avoid-version +-mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la ++mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ ++ $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ompio/libmca_common_ompio.la + + noinst_LTLIBRARIES = $(component_noinst) + libmca_fbtl_posix_la_SOURCES = $(sources) diff --git a/8322.patch b/8322.patch deleted file mode 100644 index 8c04e5d..0000000 --- a/8322.patch +++ /dev/null @@ -1,1682 +0,0 @@ -From 31068e063b8795ae11f3a59d4080db1fe111cfaf Mon Sep 17 00:00:00 2001 -From: George Bosilca -Date: Mon, 28 Dec 2020 15:36:05 -0500 -Subject: [PATCH 1/3] Major update to the AVX* detection and support - -1. Consistent march flag order between configure and make. - -2. op/avx: give the option to skip some tests - -it is possible to skip some intrinsic tests by setting some environment variables to "no" before invoking configure: - - ompi_cv_op_avx_check_avx512 - - ompi_cv_op_avx_check_avx2 - - ompi_cv_op_avx_check_avx - - ompi_cv_op_avx_check_sse41 - - ompi_cv_op_avx_check_sse3 - -3. op/avx: update AVX512 flags - -try --mavx512f -mavx512bw -mavx512vl -mavx512dq -instead of --march=skylake-avx512 - -since the former is less likely to conflict with user provided CFLAGS -(e.g. -march=...) - -Thanks Bart Oldeman for pointing this. - -4. op/avx: have the op/avx library depend on libmpi.so - -Refs. open-mpi/ompi#8323 - -Signed-off-by: Gilles Gouaillardet -Signed-off-by: George Bosilca ---- - ompi/mca/op/avx/Makefile.am | 4 +- - ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++----------------- - 2 files changed, 174 insertions(+), 155 deletions(-) - -diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am -index 41dcf2e1834..b1d84d90b33 100644 ---- a/ompi/mca/op/avx/Makefile.am -+++ b/ompi/mca/op/avx/Makefile.am -@@ -2,7 +2,7 @@ - # Copyright (c) 2019-2020 The University of Tennessee and The University - # of Tennessee Research Foundation. All rights - # reserved. --# Copyright (c) 2020 Research Organization for Information Science -+# Copyright (c) 2020-2021 Research Organization for Information Science - # and Technology (RIST). All rights reserved. - # $COPYRIGHT$ - # -@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir) - mcacomponent_LTLIBRARIES = $(component_install) - mca_op_avx_la_SOURCES = $(sources) - mca_op_avx_la_LIBADD = $(specialized_op_libs) --mca_op_avx_la_LDFLAGS = -module -avoid-version -+mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la - - - # Specific information for static builds. -diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4 -index 09d8b374c8e..f61b7100ef4 100644 ---- a/ompi/mca/op/avx/configure.m4 -+++ b/ompi/mca/op/avx/configure.m4 -@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - op_avx_support=0 - op_avx2_support=0 - op_avx512_support=0 -+ -+ AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3]) -+ AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41]) -+ AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx]) -+ AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2]) -+ AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512]) -+ - OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save]) - - AS_IF([test "$opal_cv_asm_arch" = "X86_64"], -@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # - # Check for AVX512 support - # -- AC_MSG_CHECKING([for AVX512 support (no additional flags)]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -- __m512 vA, vB; -- _mm512_add_ps(vA, vB) -- ]])], -- [op_avx512_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- -- AS_IF([test $op_avx512_support -eq 0], -- [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS -march=skylake-avx512" -+ AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) -+ AS_IF([test "$op_avx_check_avx512" = "yes"], -+ [AC_MSG_CHECKING([for AVX512 support (no additional flags)]) - AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include ]], - [[ -@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - _mm512_add_ps(vA, vB) - ]])], - [op_avx512_support=1 -- MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512" - AC_MSG_RESULT([yes])], - [AC_MSG_RESULT([no])]) -- CFLAGS="$op_avx_cflags_save" -- ]) -- # -- # Some combination of gcc and older as would not correctly build the code generated by -- # _mm256_loadu_si256. Screen them out. -- # -- AS_IF([test $op_avx512_support -eq 1], -- [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ -+ AS_IF([test $op_avx512_support -eq 0], -+ [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ -+ __m512 vA, vB; -+ _mm512_add_ps(vA, vB) -+ ]])], -+ [op_avx512_support=1 -+ MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq" -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])]) -+ CFLAGS="$op_avx_cflags_save" -+ ]) -+ # -+ # Some combination of gcc and older as would not correctly build the code generated by -+ # _mm256_loadu_si256. Screen them out. -+ # -+ AS_IF([test $op_avx512_support -eq 1], -+ [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; - __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1])) -- ]])], -- [AC_MSG_RESULT([yes])], -- [op_avx512_support=0 -- MCA_BUILD_OP_AVX512_FLAGS="" -- AC_MSG_RESULT([no])]) -- CFLAGS="$op_avx_cflags_save" -- ]) -- # -- # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out. -- # -- AS_IF([test $op_avx512_support -eq 1], -- [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ ]])], -+ [AC_MSG_RESULT([yes])], -+ [op_avx512_support=0 -+ MCA_BUILD_OP_AVX512_FLAGS="" -+ AC_MSG_RESULT([no])]) -+ CFLAGS="$op_avx_cflags_save" -+ ]) -+ # -+ # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out. -+ # -+ AS_IF([test $op_avx512_support -eq 1], -+ [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m512i vA, vB; - _mm512_mullo_epi64(vA, vB) -- ]])], -- [AC_MSG_RESULT([yes])], -- [op_avx512_support=0 -- MCA_BUILD_OP_AVX512_FLAGS="" -- AC_MSG_RESULT([no])]) -- CFLAGS="$op_avx_cflags_save" -- ]) -+ ]])], -+ [AC_MSG_RESULT([yes])], -+ [op_avx512_support=0 -+ MCA_BUILD_OP_AVX512_FLAGS="" -+ AC_MSG_RESULT([no])]) -+ CFLAGS="$op_avx_cflags_save" -+ ])]) - # - # Check support for AVX2 - # -- AC_MSG_CHECKING([for AVX2 support (no additional flags)]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) -+ AS_IF([test "$op_avx_check_avx2" = "yes"], -+ [AC_MSG_CHECKING([for AVX2 support (no additional flags)]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m256 vA, vB; - _mm256_add_ps(vA, vB) -- ]])], -- [op_avx2_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- AS_IF([test $op_avx2_support -eq 0], -- [AC_MSG_CHECKING([for AVX2 support (with -mavx2)]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS -mavx2" -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ ]])], -+ [op_avx2_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])]) -+ AS_IF([test $op_avx2_support -eq 0], -+ [AC_MSG_CHECKING([for AVX2 support (with -mavx2)]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="-mavx2 $CFLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m256 vA, vB; - _mm256_add_ps(vA, vB) -- ]])], -- [op_avx2_support=1 -- MCA_BUILD_OP_AVX2_FLAGS="-mavx2" -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- CFLAGS="$op_avx_cflags_save" -- ]) -- # -- # Some combination of gcc and older as would not correctly build the code generated by -- # _mm256_loadu_si256. Screen them out. -- # -- AS_IF([test $op_avx2_support -eq 1], -- [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS" -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ ]])], -+ [op_avx2_support=1 -+ MCA_BUILD_OP_AVX2_FLAGS="-mavx2" -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])]) -+ CFLAGS="$op_avx_cflags_save" -+ ]) -+ # -+ # Some combination of gcc and older as would not correctly build the code generated by -+ # _mm256_loadu_si256. Screen them out. -+ # -+ AS_IF([test $op_avx2_support -eq 1], -+ [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - int A[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - __m256i vA = _mm256_loadu_si256((__m256i*)&A) -- ]])], -- [AC_MSG_RESULT([yes])], -- [op_avx2_support=0 -- MCA_BUILD_OP_AVX2_FLAGS="" -- AC_MSG_RESULT([no])]) -- CFLAGS="$op_avx_cflags_save" -- ]) -+ ]])], -+ [AC_MSG_RESULT([yes])], -+ [op_avx2_support=0 -+ MCA_BUILD_OP_AVX2_FLAGS="" -+ AC_MSG_RESULT([no])]) -+ CFLAGS="$op_avx_cflags_save" -+ ])]) - # - # What about early AVX support. The rest of the logic is slightly different as - # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check -@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3 - # instructions. - # -- AC_MSG_CHECKING([for AVX support (no additional flags)]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) -+ AS_IF([test "$op_avx_check_avx" = "yes"], -+ [AC_MSG_CHECKING([for AVX support (no additional flags)]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m128 vA, vB; - _mm_add_ps(vA, vB) -- ]])], -- [op_avx_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -+ ]])], -+ [op_avx_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])])]) - # - # Check for SSE4.1 support - # -- AS_IF([test $op_avx_support -eq 1], -- [AC_MSG_CHECKING([for SSE4.1 support]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) -+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"], -+ [AC_MSG_CHECKING([for SSE4.1 support]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m128i vA, vB; - (void)_mm_max_epi8(vA, vB) -- ]])], -- [op_sse41_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- ]) -+ ]])], -+ [op_sse41_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])]) -+ ]) - # - # Check for SSE3 support - # -- AS_IF([test $op_avx_support -eq 1], -- [AC_MSG_CHECKING([for SSE3 support]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) -+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"], -+ [AC_MSG_CHECKING([for SSE3 support]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - int A[4] = {0, 1, 2, 3}; - __m128i vA = _mm_lddqu_si128((__m128i*)&A) -- ]])], -- [op_sse3_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- ]) -+ ]])], -+ [op_sse3_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])]) -+ ]) - # Second pass, do we need to add the AVX flag ? - AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0], -- [AC_MSG_CHECKING([for AVX support (with -mavx)]) -- op_avx_cflags_save="$CFLAGS" -- CFLAGS="$CFLAGS -mavx" -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ [AS_IF([test "$op_avx_check_avx" = "yes"], -+ [AC_MSG_CHECKING([for AVX support (with -mavx)]) -+ op_avx_cflags_save="$CFLAGS" -+ CFLAGS="-mavx $CFLAGS" -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m128 vA, vB; - _mm_add_ps(vA, vB) - ]])], -- [op_avx_support=1 -- MCA_BUILD_OP_AVX_FLAGS="-mavx" -- op_sse41_support=0 -- op_sse3_support=0 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -+ [op_avx_support=1 -+ MCA_BUILD_OP_AVX_FLAGS="-mavx" -+ op_sse41_support=0 -+ op_sse3_support=0 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])])]) - -- AS_IF([test $op_sse41_support -eq 0], -- [AC_MSG_CHECKING([for SSE4.1 support]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -- [[ -+ AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0], -+ [AC_MSG_CHECKING([for SSE4.1 support]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], -+ [[ - __m128i vA, vB; - (void)_mm_max_epi8(vA, vB) -- ]])], -- [op_sse41_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- ]) -- AS_IF([test $op_sse3_support -eq 0], -- [AC_MSG_CHECKING([for SSE3 support]) -- AC_LINK_IFELSE( -- [AC_LANG_PROGRAM([[#include ]], -+ ]])], -+ [op_sse41_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])])]) -+ AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0], -+ [AC_MSG_CHECKING([for SSE3 support]) -+ AC_LINK_IFELSE( -+ [AC_LANG_PROGRAM([[#include ]], - [[ - int A[4] = {0, 1, 2, 3}; - __m128i vA = _mm_lddqu_si128((__m128i*)&A) - ]])], -- [op_sse3_support=1 -- AC_MSG_RESULT([yes])], -- [AC_MSG_RESULT([no])]) -- ]) -- CFLAGS="$op_avx_cflags_save" -- ]) -+ [op_sse3_support=1 -+ AC_MSG_RESULT([yes])], -+ [AC_MSG_RESULT([no])])]) -+ CFLAGS="$op_avx_cflags_save"]) - - AC_LANG_POP([C]) - ]) -@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS) - AC_SUBST(MCA_BUILD_OP_AVX_FLAGS) - -+ AS_VAR_POPDEF([op_avx_check_avx512]) -+ AS_VAR_POPDEF([op_avx_check_avx2]) -+ AS_VAR_POPDEF([op_avx_check_avx]) -+ AS_VAR_POPDEF([op_avx_check_sse41]) -+ AS_VAR_POPDEF([op_avx_check_sse3]) -+ - OPAL_VAR_SCOPE_POP - # Enable this component iff we have at least the most basic form of support - # for vectorial ISA - -From fcf2766a03e3c2a1001679013878209bcddd50ae Mon Sep 17 00:00:00 2001 -From: George Bosilca -Date: Mon, 28 Dec 2020 12:18:07 -0500 -Subject: [PATCH 2/3] AVX code generation improvements - -1. Allow fallback to a lesser AVX support during make - -Due to the fact that some distro restrict the compiule architecture -during make (while not setting any restrictions during configure) we -need to detect the target architecture also during make in order to -restrict the code we generate. - -2. Add comments and better protect the arch specific code. - -Identify all the vectorial functions used and clasify them according to -the neccesary hardware capabilities. -Use these requirements to protect the code for load and stores (the rest -of the code being automatically generated it is more difficult to -protect). - -3. Correctly check for AVX* support. - -Signed-off-by: George Bosilca ---- - ompi/mca/op/avx/configure.m4 | 28 +-- - ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++----- - 2 files changed, 288 insertions(+), 62 deletions(-) - -diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4 -index f61b7100ef4..f3651f09d43 100644 ---- a/ompi/mca/op/avx/configure.m4 -+++ b/ompi/mca/op/avx/configure.m4 -@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # - # Check for AVX512 support - # -- AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) -+ AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) - AS_IF([test "$op_avx_check_avx512" = "yes"], - [AC_MSG_CHECKING([for AVX512 support (no additional flags)]) - AC_LINK_IFELSE( -@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # - # Check support for AVX2 - # -- AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) -+ AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) - AS_IF([test "$op_avx_check_avx2" = "yes"], - [AC_MSG_CHECKING([for AVX2 support (no additional flags)]) - AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include ]], - [[ -- __m256 vA, vB; -- _mm256_add_ps(vA, vB) -+ __m256i vA, vB, vC; -+ vC = _mm256_and_si256(vA, vB) - ]])], - [op_avx2_support=1 - AC_MSG_RESULT([yes])], -@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include ]], - [[ -- __m256 vA, vB; -- _mm256_add_ps(vA, vB) -+ __m256i vA, vB, vC; -+ vC = _mm256_and_si256(vA, vB) - ]])], - [op_avx2_support=1 - MCA_BUILD_OP_AVX2_FLAGS="-mavx2" -@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - CFLAGS="$op_avx_cflags_save" - ])]) - # -- # What about early AVX support. The rest of the logic is slightly different as -+ # What about early AVX support? The rest of the logic is slightly different as - # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check - # if we can compile AVX code without a flag, then we validate that we have support - # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of - # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3 - # instructions. - # -- AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) -+ AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) - AS_IF([test "$op_avx_check_avx" = "yes"], - [AC_MSG_CHECKING([for AVX support (no additional flags)]) - AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include ]], - [[ -- __m128 vA, vB; -- _mm_add_ps(vA, vB) -+ __m256 vA, vB, vC; -+ vC = _mm256_add_ps(vA, vB) - ]])], - [op_avx_support=1 - AC_MSG_RESULT([yes])], -@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # - # Check for SSE4.1 support - # -- AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) -+ AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) - AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"], - [AC_MSG_CHECKING([for SSE4.1 support]) - AC_LINK_IFELSE( -@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - # - # Check for SSE3 support - # -- AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) -+ AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) - AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"], - [AC_MSG_CHECKING([for SSE3 support]) - AC_LINK_IFELSE( -@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ - AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include ]], - [[ -- __m128 vA, vB; -- _mm_add_ps(vA, vB) -+ __m256 vA, vB, vC; -+ vC = _mm256_add_ps(vA, vB) - ]])], - [op_avx_support=1 - MCA_BUILD_OP_AVX_FLAGS="-mavx" -diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c -index 95a9c9ab84e..ef3f0932906 100644 ---- a/ompi/mca/op/avx/op_avx_functions.c -+++ b/ompi/mca/op/avx/op_avx_functions.c -@@ -1,5 +1,5 @@ - /* -- * Copyright (c) 2019-2020 The University of Tennessee and The University -+ * Copyright (c) 2019-2021 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2020 Research Organization for Information Science -@@ -24,16 +24,42 @@ - #include "ompi/mca/op/avx/op_avx.h" - - #include -- -+/** -+ * The following logic is necessary to cope with distro maintainer's desire to change the compilation -+ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what -+ * code can be generated during make. If we detect that the current code generation architecture has -+ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back -+ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out). -+ */ - #if defined(GENERATE_AVX512_CODE) --#define PREPEND _avx512 --#elif defined(GENERATE_AVX2_CODE) --#define PREPEND _avx2 --#elif defined(GENERATE_AVX_CODE) --#define PREPEND _avx --#else --#error This file should not be compiled in this conditions --#endif -+# if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) -+# define PREPEND _avx512 -+# else -+# undef GENERATE_AVX512_CODE -+# endif /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */ -+#endif /* defined(GENERATE_AVX512_CODE) */ -+ -+#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE) -+# if defined(__AVX2__) -+# define PREPEND _avx2 -+# else -+# undef GENERATE_AVX2_CODE -+# endif /* defined(__AVX2__) */ -+#endif /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */ -+ -+#if !defined(PREPEND) && defined(GENERATE_AVX_CODE) -+# if defined(__AVX__) -+# define PREPEND _avx -+# endif -+#endif /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */ -+ -+#if !defined(PREPEND) -+# if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 -+# error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step. -+# else -+# error This file should not be compiled in this conditions. Please provide the config.log file to the OMPI developers. -+# endif /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */ -+#endif /* !defined(PREPEND) */ - - /* - * Concatenate preprocessor tokens A and B without expanding macro definitions -@@ -46,6 +72,102 @@ - */ - #define OP_CONCAT(A, B) OP_CONCAT_NX(A, B) - -+/* -+ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq -+ * -+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide -+ * -+ * _mm_add_epi[8,16,32,64] SSE2 -+ * _mm_add_pd SSE2 -+ * _mm_add_ps SSE -+ * _mm_adds_epi[8,16] SSE2 -+ * _mm_adds_epu[8,16] SSE2 -+ * _mm_and_si128 SSE2 -+ * _mm_lddqu_si128 SSE3 -+ * _mm_loadu_pd SSE2 -+ * _mm_loadu_ps SSE -+ * _mm_max_epi8 SSE4.1 -+ * _mm_max_epi16 SSE2 -+ * _mm_max_epi32 SSE4.1 -+ * _mm_max_epi64 AVX512VL + AVX512F -+ * _mm_max_epu8 SSE2 -+ * _mm_max_epu[16,32] SSE4.1 -+ * _mm_max_epu64 AVX512VL + AVX512F -+ * _mm_max_pd SSE2 -+ * _mm_max_ps SSE -+ * _mm_min_epi8 SSE4.1 -+ * _mm_min_epi16 SSE2 -+ * _mm_min_epi32 SSE4.1 -+ * _mm_min_epi64 AVX512VL + AVX512F -+ * _mm_min_epu8 SSE2 -+ * _mm_min_epu[16,32] SSE4.1 -+ * _mm_min_epu64 AVX512VL + AVX512F -+ * _mm_min_pd SSE2 -+ * _mm_min_ps SSE -+ * _mm_mul_pd SSE2 -+ * _mm_mul_ps SSE -+ * _mm_mullo_epi16 SSE2 -+ * _mm_mullo_epi32 SSE4.1 -+ * _mm_mullo_epi64 AVX512VL + AVX512DQ -+ * _mm_or_si128 SSE2 -+ * _mm_storeu_pd SSE2 -+ * _mm_storeu_ps SSE -+ * _mm_storeu_si128 SSE2 -+ * _mm_xor_si128 SSE2 -+ * _mm256_add_epi[8,16,32,64] AVX2 -+ * _mm256_add_p[s,d] AVX -+ * _mm256_adds_epi[8,16] AVX2 -+ * _mm256_adds_epu[8,16] AVX2 -+ * _mm256_and_si256 AVX2 -+ * _mm256_loadu_p[s,d] AVX -+ * _mm256_loadu_si256 AVX -+ * _mm256_max_epi[8,16,32] AVX2 -+ * _mm256_max_epi64 AVX512VL + AVX512F -+ * _mm256_max_epu[8,16,32] AVX2 -+ * _mm256_max_epu64 AVX512VL + AVX512F -+ * _mm256_max_p[s,d] AVX -+ * _mm256_min_epi[8,16,32] AVX2 -+ * _mm256_min_epi64 AVX512VL + AVX512F -+ * _mm256_min_epu[8,16,32] AVX2 -+ * _mm256_min_epu64 AVX512VL + AVX512F -+ * _mm256_min_p[s,d] AVX -+ * _mm256_mul_p[s,d] AVX -+ * _mm256_mullo_epi[16,32] AVX2 -+ * _mm256_mullo_epi64 AVX512VL + AVX512DQ -+ * _mm256_or_si256 AVX2 -+ * _mm256_storeu_p[s,d] AVX -+ * _mm256_storeu_si256 AVX -+ * _mm256_xor_si256 AVX2 -+ * _mm512_add_epi[8,16] AVX512BW -+ * _mm512_add_epi[32,64] AVX512F -+ * _mm512_add_p[s,d] AVX512F -+ * _mm512_adds_epi[8,16] AVX512BW -+ * _mm512_adds_epu[8,16] AVX512BW -+ * _mm512_and_si512 AVX512F -+ * _mm512_cvtepi16_epi8 AVX512BW -+ * _mm512_cvtepi8_epi16 AVX512BW -+ * _mm512_loadu_p[s,d] AVX512F -+ * _mm512_loadu_si512 AVX512F -+ * _mm512_max_epi[8,16] AVX512BW -+ * _mm512_max_epi[32,64] AVX512F -+ * _mm512_max_epu[8,16] AVX512BW -+ * _mm512_max_epu[32,64] AVX512F -+ * _mm512_max_p[s,d] AVX512F -+ * _mm512_min_epi[8,16] AVX512BW -+ * _mm512_min_epi[32,64] AVX512F -+ * _mm512_min_epu[8,16] AVX512BW -+ * _mm512_min_epu[32,64] AVX512F -+ * _mm512_min_p[s,d] AVX512F -+ * _mm512_mul_p[s,d] AVX512F -+ * _mm512_mullo_epi16 AVX512BW -+ * _mm512_mullo_epi32 AVX512F -+ * _mm512_mullo_epi64 AVX512DQ -+ * _mm512_or_si512 AVX512F -+ * _mm512_storeu_p[s,d] AVX512F -+ * _mm512_storeu_si512 AVX512F -+ * _mm512_xor_si512 AVX512F -+ */ -+ - /* - * Since all the functions in this file are essentially identical, we - * use a macro to substitute in names and types. The core operation -@@ -62,13 +184,14 @@ - (((_flag) & mca_op_avx_component.flags) == (_flag)) - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (512 / 8) / sizeof(type); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ -- __m512i vecA = _mm512_loadu_si512((__m512*)in); \ -+ __m512i vecA = _mm512_loadu_si512((__m512*)in); \ - in += types_per_step; \ -- __m512i vecB = _mm512_loadu_si512((__m512*)out); \ -+ __m512i vecB = _mm512_loadu_si512((__m512*)out); \ - __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ - _mm512_storeu_si512((__m512*)out, res); \ - out += types_per_step; \ -@@ -76,10 +199,14 @@ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \ -@@ -87,30 +214,37 @@ - __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ - in += types_per_step; \ - __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ -- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ -+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ - _mm256_storeu_si256((__m256i*)out, res); \ - out += types_per_step; \ - } \ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE3__ - #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ -- int types_per_step = (128 / 8) / sizeof(type); /* AVX */ \ -+ int types_per_step = (128 / 8) / sizeof(type); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ - __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ - in += types_per_step; \ - __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ -- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ -+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ - _mm_storeu_si128((__m128i*)out, res); \ - out += types_per_step; \ - } \ - } - #else -+#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 -+#endif /* __SSE3__ */ -+#else - #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512BW__ && __AVX__ - #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \ -- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \ -+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \ -+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \ - in += types_per_step; \ - __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ - __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ -@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16 -+#endif /* __AVX512BW__ && __AVX__ */ -+#else - #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - /** -@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ - * - */ - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(type); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512i vecA = _mm512_loadu_si512((__m512i*)in); \ -+ __m512i vecA = _mm512_loadu_si512((__m512i*)in); \ - in += types_per_step; \ -- __m512i vecB = _mm512_loadu_si512((__m512i*)out); \ -+ __m512i vecB = _mm512_loadu_si512((__m512i*)out); \ - __m512i res = _mm512_##op##_si512(vecA, vecB); \ - _mm512_storeu_si512((__m512i*)out, res); \ - out += types_per_step; \ -@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(type); \ -@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ - __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ - in += types_per_step; \ - __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ -- __m256i res = _mm256_##op##_si256(vecA, vecB); \ -+ __m256i res = _mm256_##op##_si256(vecA, vecB); \ - _mm256_storeu_si256((__m256i*)out, res); \ - out += types_per_step; \ - } \ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE3__ && __SSE2__ - #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(type); \ -@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ - __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ - in += types_per_step; \ - __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ -- __m128i res = _mm_##op##_si128(vecA, vecB); \ -+ __m128i res = _mm_##op##_si128(vecA, vecB); \ - _mm_storeu_si128((__m128i*)out, res); \ - out += types_per_step; \ - } \ - } - #else -+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 -+#endif /* __SSE3__ && __SSE2__ */ -+#else - #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_FLOAT_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(float); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512 vecA = _mm512_loadu_ps((__m512*)in); \ -- __m512 vecB = _mm512_loadu_ps((__m512*)out); \ -+ __m512 vecA = _mm512_loadu_ps((__m512*)in); \ -+ __m512 vecB = _mm512_loadu_ps((__m512*)out); \ - in += types_per_step; \ - __m512 res = _mm512_##op##_ps(vecA, vecB); \ - _mm512_storeu_ps((__m512*)out, res); \ -@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_FLOAT_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX_FLOAT_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(float); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ -- __m256 vecA = _mm256_loadu_ps(in); \ -+ __m256 vecA = _mm256_loadu_ps(in); \ - in += types_per_step; \ -- __m256 vecB = _mm256_loadu_ps(out); \ -+ __m256 vecB = _mm256_loadu_ps(out); \ - __m256 res = _mm256_##op##_ps(vecA, vecB); \ -- _mm256_storeu_ps(out, res); \ -+ _mm256_storeu_ps(out, res); \ - out += types_per_step; \ - } \ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX_FLOAT_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE__ - #define OP_AVX_SSE_FLOAT_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(float); \ -@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in - } \ - } - #else -+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps -+#endif /* __SSE__ */ -+#else - #define OP_AVX_SSE_FLOAT_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_DOUBLE_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(double); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512d vecA = _mm512_loadu_pd(in); \ -+ __m512d vecA = _mm512_loadu_pd(in); \ - in += types_per_step; \ -- __m512d vecB = _mm512_loadu_pd(out); \ -+ __m512d vecB = _mm512_loadu_pd(out); \ - __m512d res = _mm512_##op##_pd(vecA, vecB); \ - _mm512_storeu_pd((out), res); \ - out += types_per_step; \ -@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd -+#endif /* __AVXF512__ */ -+#else - #define OP_AVX_AVX512_DOUBLE_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX_DOUBLE_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(double); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ -- __m256d vecA = _mm256_loadu_pd(in); \ -+ __m256d vecA = _mm256_loadu_pd(in); \ - in += types_per_step; \ -- __m256d vecB = _mm256_loadu_pd(out); \ -+ __m256d vecB = _mm256_loadu_pd(out); \ - __m256d res = _mm256_##op##_pd(vecA, vecB); \ - _mm256_storeu_pd(out, res); \ - out += types_per_step; \ -@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX_DOUBLE_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE2__ - #define OP_AVX_SSE2_DOUBLE_FUNC(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(double); \ -@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v - } \ - } - #else -+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd -+#endif /* __SSE2__ */ -+#else - #define OP_AVX_SSE2_DOUBLE_FUNC(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, - * routines, needed for some optimizations. - */ - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (512 / 8) / sizeof(type); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512i vecA = _mm512_loadu_si512(in1); \ -- __m512i vecB = _mm512_loadu_si512(in2); \ -+ __m512i vecA = _mm512_loadu_si512(in1); \ -+ __m512i vecB = _mm512_loadu_si512(in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ -@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ -@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, - __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ -- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ -+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ - _mm256_storeu_si256((__m256i*)out, res); \ - out += types_per_step; \ - } \ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE3__ && __SSE2__ - #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ - int types_per_step = (128 / 8) / sizeof(type); \ -@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, - __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ -- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ -+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ - _mm_storeu_si128((__m128i*)out, res); \ - out += types_per_step; \ - } \ - } - #else -+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 -+#endif /* __SSE3__ && __SSE2__ */ -+#else - #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512BW__ && __AVX__ - #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \ -- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \ -+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \ -+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ -@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16 -+#endif /* __AVX512BW__ && __AVX__ */ -+#else - #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - /** -@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(type); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512i vecA = _mm512_loadu_si512(in1); \ -- __m512i vecB = _mm512_loadu_si512(in2); \ -+ __m512i vecA = _mm512_loadu_si512(in1); \ -+ __m512i vecB = _mm512_loadu_si512(in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m512i res = _mm512_##op##_si512(vecA, vecB); \ -@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(type); \ -@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ -- __m256i res = _mm256_##op##_si256(vecA, vecB); \ -+ __m256i res = _mm256_##op##_si256(vecA, vecB); \ - _mm256_storeu_si256((__m256i*)out, res); \ - out += types_per_step; \ - } \ - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE3__ && __SSE2__ - #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(type); \ -@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re - __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ -- __m128i res = _mm_##op##_si128(vecA, vecB); \ -+ __m128i res = _mm_##op##_si128(vecA, vecB); \ - _mm_storeu_si128((__m128i*)out, res); \ - out += types_per_step; \ - } \ - } - #else -+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 -+#endif /* __SSE3__ && __SSE2__ */ -+#else - #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_FLOAT_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(float); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512 vecA = _mm512_loadu_ps(in1); \ -- __m512 vecB = _mm512_loadu_ps(in2); \ -+ __m512 vecA = _mm512_loadu_ps(in1); \ -+ __m512 vecB = _mm512_loadu_ps(in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m512 res = _mm512_##op##_ps(vecA, vecB); \ -@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps -+#endif /* __AVX512F__ */ -+#else - #define OP_AVX_AVX512_FLOAT_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX_FLOAT_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(float); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ -- __m256 vecA = _mm256_loadu_ps(in1); \ -- __m256 vecB = _mm256_loadu_ps(in2); \ -+ __m256 vecA = _mm256_loadu_ps(in1); \ -+ __m256 vecB = _mm256_loadu_ps(in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m256 res = _mm256_##op##_ps(vecA, vecB); \ -@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX_FLOAT_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE__ - #define OP_AVX_SSE_FLOAT_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(float); \ -@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, - } \ - } - #else -+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps -+#endif /* __SSE__ */ -+#else - #define OP_AVX_SSE_FLOAT_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - -@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, - } - - #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) -+#if __AVX512F__ - #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ - types_per_step = (512 / 8) / sizeof(double); \ - for (; left_over >= types_per_step; left_over -= types_per_step) { \ -- __m512d vecA = _mm512_loadu_pd((in1)); \ -- __m512d vecB = _mm512_loadu_pd((in2)); \ -+ __m512d vecA = _mm512_loadu_pd((in1)); \ -+ __m512d vecB = _mm512_loadu_pd((in2)); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m512d res = _mm512_##op##_pd(vecA, vecB); \ -@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd -+#endif /* __AVXF512__ */ -+#else - #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ - - #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) -+#if __AVX__ - #define OP_AVX_AVX_DOUBLE_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - types_per_step = (256 / 8) / sizeof(double); \ - for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ -- __m256d vecA = _mm256_loadu_pd(in1); \ -- __m256d vecB = _mm256_loadu_pd(in2); \ -+ __m256d vecA = _mm256_loadu_pd(in1); \ -+ __m256d vecB = _mm256_loadu_pd(in2); \ - in1 += types_per_step; \ - in2 += types_per_step; \ - __m256d res = _mm256_##op##_pd(vecA, vecB); \ -@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, - if( 0 == left_over ) return; \ - } - #else -+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd -+#endif /* __AVX__ */ -+#else - #define OP_AVX_AVX_DOUBLE_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ - - #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) -+#if __SSE2__ - #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) \ - if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \ - types_per_step = (128 / 8) / sizeof(double); \ -@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, - } \ - } - #else -+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd -+#endif /* __SSE2__ */ -+#else - #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {} - #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ - - -From 20be3fc25713ac2de3eb4d77b85248d7fe2bc28b Mon Sep 17 00:00:00 2001 -From: George Bosilca -Date: Tue, 5 Jan 2021 22:40:26 -0500 -Subject: [PATCH 3/3] A better test for MPI_OP performance. - -The test now has the ability to add a shift to all or to any of the -input and output buffers to assess the impact of unaligned operations. - -Signed-off-by: George Bosilca ---- - test/datatype/reduce_local.c | 161 ++++++++++++++++++++++------------- - 1 file changed, 104 insertions(+), 57 deletions(-) - -diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c -index 97890f94227..f227439b714 100644 ---- a/test/datatype/reduce_local.c -+++ b/test/datatype/reduce_local.c -@@ -59,7 +59,7 @@ static int total_errors = 0; - _a < _b ? _a : _b; }) - - static void print_status(char* op, char* type, int type_size, -- int count, double duration, -+ int count, int max_shift, double *duration, int repeats, - int correct ) - { - if(correct) { -@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size, - printf("%-10s %s [\033[1;31mfail\033[0m]", op, type); - total_errors++; - } -- printf(" count %-10d time %.6f seconds\n", count, duration); -+ if( 1 == max_shift ) { -+ printf(" count %-10d time (seconds) %.8f seconds\n", count, duration[0] / repeats); -+ } else { -+ printf(" count %-10d time (seconds / shifts) ", count); -+ for( int i = 0; i < max_shift; i++ ) { -+ printf("%.8f ", duration[i] / repeats ); -+ } -+ printf("\n"); -+ } - } - - static int do_ops_built = 0; -@@ -115,19 +123,23 @@ do { \ - const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ - TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ - skip_op_type = 0; \ -- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \ -- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ -- tstart = MPI_Wtime(); \ -- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ -- tend = MPI_Wtime(); \ -- if( check ) { \ -- for( i = 0; i < (COUNT)-_k; i++ ) { \ -- if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \ -- continue; \ -- printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ -- _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \ -- correctness = 0; \ -- break; \ -+ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ -+ duration[_k] = 0.0; \ -+ for(int _r = repeats; _r > 0; _r--) { \ -+ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ -+ tstart = MPI_Wtime(); \ -+ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ -+ tend = MPI_Wtime(); \ -+ duration[_k] += (tend - tstart); \ -+ if( check ) { \ -+ for( i = 0; i < (COUNT)-_k; i++ ) { \ -+ if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \ -+ continue; \ -+ printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ -+ _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \ -+ correctness = 0; \ -+ break; \ -+ } \ - } \ - } \ - } \ -@@ -139,20 +151,24 @@ do { \ - const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ - TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ - skip_op_type = 0; \ -- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \ -- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ -- tstart = MPI_Wtime(); \ -- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \ -- tend = MPI_Wtime(); \ -- if( check ) { \ -- for( i = 0; i < (COUNT); i++ ) { \ -- TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \ -- if(_v2 == OPNAME(_v1, _v3)) \ -- continue; \ -- printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ -- _k, i, _v1, (#OPNAME), _v3, _v2); \ -- correctness = 0; \ -- break; \ -+ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ -+ duration[_k] = 0.0; \ -+ for(int _r = repeats; _r > 0; _r--) { \ -+ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ -+ tstart = MPI_Wtime(); \ -+ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \ -+ tend = MPI_Wtime(); \ -+ duration[_k] += (tend - tstart); \ -+ if( check ) { \ -+ for( i = 0; i < (COUNT); i++ ) { \ -+ TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \ -+ if(_v2 == OPNAME(_v1, _v3)) \ -+ continue; \ -+ printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ -+ _k, i, _v1, (#OPNAME), _v3, _v2); \ -+ correctness = 0; \ -+ break; \ -+ } \ - } \ - } \ - } \ -@@ -163,24 +179,36 @@ int main(int argc, char **argv) - { - static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; - int count, type_size = 8, rank, size, provided, correctness = 1; -- int repeats = 1, i, c; -- double tstart, tend; -+ int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0; -+ int max_shift = 4; -+ double *duration, tstart, tend; - bool check = true; - char type[5] = "uifd", *op = "sum", *mpi_type; - int lower = 1, upper = 1000000, skip_op_type; - MPI_Op mpi_op; - -- while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) { -+ while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) { - switch(c) { - case 'l': - lower = atoi(optarg); - if( lower <= 0 ) { -- fprintf(stderr, "The number of elements must be positive\n"); -+ fprintf(stderr, "The lower number of elements must be positive\n"); - exit(-1); - } - break; - case 'u': - upper = atoi(optarg); -+ if( lower <= 0 ) { -+ fprintf(stderr, "The upper number of elements must be positive\n"); -+ exit(-1); -+ } -+ break; -+ case 'i': -+ max_shift = atoi(optarg); -+ if( max_shift <= 0 ) { -+ fprintf(stderr, "The max shift must be positive\n"); -+ exit(-1); -+ } - break; - case 'f': - check = false; -@@ -216,14 +244,32 @@ int main(int argc, char **argv) - exit(-1); - } - break; -+ case '1': -+ op1_alignment = atoi(optarg); -+ if( op1_alignment < 0 ) { -+ fprintf(stderr, "alignment for the first operand must be positive\n"); -+ exit(-1); -+ } -+ break; -+ case '2': -+ res_alignment = atoi(optarg); -+ if( res_alignment < 0 ) { -+ fprintf(stderr, "alignment for the result must be positive\n"); -+ exit(-1); -+ } -+ break; - case 'h': - fprintf(stdout, "%s options are:\n" - " -l : lower number of elements\n" - " -u : upper number of elements\n" - " -s : 8, 16, 32 or 64 bits elements\n" - " -t [i,u,f,d] : type of the elements to apply the operations on\n" -+ " -r : number of repetitions for each test\n" - " -o : comma separated list of operations to execute among\n" - " sum, min, max, prod, bor, bxor, band\n" -+ " -i : shift on all buffers to check alignment\n" -+ " -1 : (mis)alignment in elements for the first op\n" -+ " -2 : (mis)alignment in elements for the result\n" - " -v: increase the verbosity level\n" - " -h: this help message\n", argv[0]); - exit(0); -@@ -233,9 +279,10 @@ int main(int argc, char **argv) - if( !do_ops_built ) { /* not yet done, take the default */ - build_do_ops( "all", do_ops); - } -- in_buf = malloc(upper * sizeof(double)); -- inout_buf = malloc(upper * sizeof(double)); -- inout_check_buf = malloc(upper * sizeof(double)); -+ posix_memalign( &in_buf, 64, (upper + op1_alignment) * sizeof(double)); -+ posix_memalign( &inout_buf, 64, (upper + res_alignment) * sizeof(double)); -+ posix_memalign( &inout_check_buf, 64, upper * sizeof(double)); -+ duration = (double*)malloc(max_shift * sizeof(double)); - - ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); - -@@ -253,8 +300,8 @@ int main(int argc, char **argv) - correctness = 1; - if('i' == type[type_idx]) { - if( 8 == type_size ) { -- int8_t *in_int8 = (int8_t*)in_buf, -- *inout_int8 = (int8_t*)inout_buf, -+ int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)), -+ *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)), - *inout_int8_for_check = (int8_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_int8[i] = 5; -@@ -299,8 +346,8 @@ int main(int argc, char **argv) - } - } - if( 16 == type_size ) { -- int16_t *in_int16 = (int16_t*)in_buf, -- *inout_int16 = (int16_t*)inout_buf, -+ int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)), -+ *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)), - *inout_int16_for_check = (int16_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_int16[i] = 5; -@@ -345,8 +392,8 @@ int main(int argc, char **argv) - } - } - if( 32 == type_size ) { -- int32_t *in_int32 = (int32_t*)in_buf, -- *inout_int32 = (int32_t*)inout_buf, -+ int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)), -+ *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)), - *inout_int32_for_check = (int32_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_int32[i] = 5; -@@ -391,8 +438,8 @@ int main(int argc, char **argv) - } - } - if( 64 == type_size ) { -- int64_t *in_int64 = (int64_t*)in_buf, -- *inout_int64 = (int64_t*)inout_buf, -+ int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)), -+ *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)), - *inout_int64_for_check = (int64_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_int64[i] = 5; -@@ -440,8 +487,8 @@ int main(int argc, char **argv) - - if( 'u' == type[type_idx] ) { - if( 8 == type_size ) { -- uint8_t *in_uint8 = (uint8_t*)in_buf, -- *inout_uint8 = (uint8_t*)inout_buf, -+ uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)), -+ *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)), - *inout_uint8_for_check = (uint8_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_uint8[i] = 5; -@@ -486,8 +533,8 @@ int main(int argc, char **argv) - } - } - if( 16 == type_size ) { -- uint16_t *in_uint16 = (uint16_t*)in_buf, -- *inout_uint16 = (uint16_t*)inout_buf, -+ uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)), -+ *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)), - *inout_uint16_for_check = (uint16_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_uint16[i] = 5; -@@ -532,8 +579,8 @@ int main(int argc, char **argv) - } - } - if( 32 == type_size ) { -- uint32_t *in_uint32 = (uint32_t*)in_buf, -- *inout_uint32 = (uint32_t*)inout_buf, -+ uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)), -+ *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)), - *inout_uint32_for_check = (uint32_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_uint32[i] = 5; -@@ -578,8 +625,8 @@ int main(int argc, char **argv) - } - } - if( 64 == type_size ) { -- uint64_t *in_uint64 = (uint64_t*)in_buf, -- *inout_uint64 = (uint64_t*)inout_buf, -+ uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)), -+ *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)), - *inout_uint64_for_check = (uint64_t*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_uint64[i] = 5; -@@ -626,8 +673,8 @@ int main(int argc, char **argv) - } - - if( 'f' == type[type_idx] ) { -- float *in_float = (float*)in_buf, -- *inout_float = (float*)inout_buf, -+ float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)), -+ *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)), - *inout_float_for_check = (float*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_float[i] = 1000.0+1; -@@ -658,8 +705,8 @@ int main(int argc, char **argv) - } - - if( 'd' == type[type_idx] ) { -- double *in_double = (double*)in_buf, -- *inout_double = (double*)inout_buf, -+ double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)), -+ *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)), - *inout_double_for_check = (double*)inout_check_buf; - for( i = 0; i < count; i++ ) { - in_double[i] = 10.0+1; -@@ -691,7 +738,7 @@ int main(int argc, char **argv) - check_and_continue: - if( !skip_op_type ) - print_status(array_of_ops[do_ops[op_idx]].mpi_op_name, -- mpi_type, type_size, count, tend-tstart, correctness); -+ mpi_type, type_size, count, max_shift, duration, repeats, correctness); - } - if( !skip_op_type ) - printf("\n"); diff --git a/8348.patch b/8348.patch deleted file mode 100644 index 89d5fd7..0000000 --- a/8348.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 838568da9fce85b4555b0e0cbd899c8e8ef75696 Mon Sep 17 00:00:00 2001 -From: George Bosilca -Date: Wed, 6 Jan 2021 13:30:40 -0500 -Subject: [PATCH] A started generalized request should be marked as pending. - -Fixes #8340 - -Signed-off-by: George Bosilca -(cherry picked from commit 434a2515f8aab11f505b2fca0b3d8cc41e24cef2) ---- - ompi/request/grequest.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/ompi/request/grequest.c b/ompi/request/grequest.c -index c895b4232b6..02affd642aa 100644 ---- a/ompi/request/grequest.c -+++ b/ompi/request/grequest.c -@@ -163,6 +163,7 @@ int ompi_grequest_start( - greq->greq_free.c_free = gfree_fn; - greq->greq_cancel.c_cancel = gcancel_fn; - greq->greq_base.req_status = ompi_status_empty; -+ greq->greq_base.req_complete = REQUEST_PENDING; - - *request = &greq->greq_base; - return OMPI_SUCCESS; diff --git a/openmpi.spec b/openmpi.spec index c3483ad..a53bbc5 100644 --- a/openmpi.spec +++ b/openmpi.spec @@ -26,12 +26,11 @@ %endif # Run autogen - needed for some patches -# For Patch0 %bcond_without autogen Name: openmpi%{?_cc_name_suffix} -Version: 4.1.0 -Release: 7%{?dist} +Version: 4.1.1 +Release: 2%{?dist} Summary: Open Message Passing Interface License: BSD and MIT and Romio URL: http://www.open-mpi.org/ @@ -42,11 +41,8 @@ Source1: openmpi.module.in Source2: openmpi.pth.py2 Source3: openmpi.pth.py3 Source4: macros.openmpi - -# Fix AVX library linkage -Patch0: https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8322.patch -# Fix generalized requests (mpi4py test failure) -Patch1: https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8348.patch +Patch1: 266189935aef4fce825d0db831b4b53accc62c32.patch +Patch2: 0001-Revert-ucx-check-supported-transports-and-devices-fo.patch BuildRequires: gcc-c++ BuildRequires: gcc-gfortran @@ -82,6 +78,9 @@ BuildRequires: perl-interpreter BuildRequires: perl(Getopt::Long) BuildRequires: pmix-devel BuildRequires: python%{python3_pkgversion}-devel +%ifarch x86_64 +BuildRequires: libpsm2-devel +%endif %if %{with ucx} BuildRequires: ucx-devel %endif @@ -91,7 +90,7 @@ BuildRequires: rpm-mpi-hooks %endif Provides: mpi -%if 0%{?rhel} +%if 0%{?rhel} == 7 # Need this for /etc/profile.d/modules.sh Requires: environment-modules %endif @@ -169,7 +168,7 @@ OpenMPI support for Python 3. %prep -%autosetup -p1 +%autosetup -p1 -n %{name}-%{version} %if %{with autogen} ./autogen.pl --force %endif @@ -358,6 +357,13 @@ make check %changelog +* Thu Jul 15 2021 Honggang Li - 4.1.1-2 +- Update to 4.1.1 +- Enable psm2 support +- fbtl-posix: link to common_ompio +- Revert upstream commit c36d7459b6331c4d +- Resolve: rhbz#1869443 + * Fri Apr 16 2021 Mohan Boddu - 4.1.0-7 - Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937 diff --git a/sources b/sources index ebb82c2..cc0a735 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openmpi-4.1.0.tar.bz2) = eaf086ab4929ce5a9a3e867c8315bf802ff4dc75d3f05d740e22dfd97803a4559212dacbe06920d42ac6644f46057eb6980cccf5a8b0a7df9c5bdf5bffc0b3a6 +SHA512 (openmpi-4.1.1.tar.bz2) = 0d85ba45a40c0879f266e5286615e2cf94eb3570f0a705194525821d5c85d460cefc3a2da8207e6e84c479d3d0da656e2342cc2d6f88c4b4577ca22bbeacc89d