diff --git a/.gitignore b/.gitignore
index 104cfac..5746d2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,3 +54,4 @@ openmpi-1.4.1-RH.tar.bz2
 /openmpi-4.0.4.tar.bz2
 /openmpi-4.0.5.tar.bz2
 /openmpi-4.1.0.tar.bz2
+/openmpi-4.1.1.tar.bz2
diff --git a/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch b/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch
new file mode 100644
index 0000000..2871232
--- /dev/null
+++ b/0001-Revert-ucx-check-supported-transports-and-devices-fo.patch
@@ -0,0 +1,367 @@
+From 63c80c7692e55f634cbca6f67cc5c9cdef3a04d2 Mon Sep 17 00:00:00 2001
+From: Honggang Li <honli@redhat.com>
+Date: Mon, 28 Jun 2021 21:38:13 +0800
+Subject: [PATCH] Revert "ucx: check supported transports and devices for
+ setting priority"
+
+This reverts commit c36d7459b6331c4da825cad5a64326e7c1a272aa.
+---
+ contrib/platform/mellanox/optimized.conf |   2 -
+ ompi/mca/pml/ucx/pml_ucx_component.c     |  15 +-
+ opal/mca/common/ucx/common_ucx.c         | 202 +----------------------
+ opal/mca/common/ucx/common_ucx.h         |  15 --
+ opal/mca/common/ucx/configure.m4         |   2 -
+ 5 files changed, 2 insertions(+), 234 deletions(-)
+
+diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf
+index 543fd8d1e224..b86b37c9e2fa 100644
+--- a/contrib/platform/mellanox/optimized.conf
++++ b/contrib/platform/mellanox/optimized.conf
+@@ -61,8 +61,6 @@
+ coll = ^ml
+ hwloc_base_binding_policy = core
+ btl = self
+-pml_ucx_tls = any
+-pml_ucx_devices = any
+ # Basic behavior to smooth startup
+ mca_base_component_show_load_errors = 0
+ orte_abort_timeout = 10
+diff --git a/ompi/mca/pml/ucx/pml_ucx_component.c b/ompi/mca/pml/ucx/pml_ucx_component.c
+index 6aed6c41d11d..ed9cc6573e8e 100644
+--- a/ompi/mca/pml/ucx/pml_ucx_component.c
++++ b/ompi/mca/pml/ucx/pml_ucx_component.c
+@@ -107,26 +107,13 @@ static mca_pml_base_module_t*
+ mca_pml_ucx_component_init(int* priority, bool enable_progress_threads,
+                            bool enable_mpi_threads)
+ {
+-    opal_common_ucx_support_level_t support_level;
+     int ret;
+ 
+-    support_level = opal_common_ucx_support_level(ompi_pml_ucx.ucp_context);
+-    if (support_level == OPAL_COMMON_UCX_SUPPORT_NONE) {
+-        return NULL;
+-    }
+-
+     if ( (ret = mca_pml_ucx_init(enable_mpi_threads)) != 0) {
+         return NULL;
+     }
+ 
+-    /*
+-     * If found supported devices - set to the configured (high) priority.
+-     * Otherwise - Found only supported transports (which could be exposed by
+-     *             unsupported devices), so set a priority lower than ob1.
+-     */
+-    *priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ?
+-                ompi_pml_ucx.priority : 19;
+-    PML_UCX_VERBOSE(2, "returning priority %d", *priority);
++    *priority = ompi_pml_ucx.priority;
+     return &ompi_pml_ucx.super;
+ }
+ 
+diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c
+index ac7a17d799a5..ae8e66877ab6 100644
+--- a/opal/mca/common/ucx/common_ucx.c
++++ b/opal/mca/common/ucx/common_ucx.c
+@@ -14,11 +14,8 @@
+ #include "opal/mca/base/mca_base_framework.h"
+ #include "opal/mca/pmix/pmix.h"
+ #include "opal/memoryhooks/memory.h"
+-#include "opal/util/argv.h"
+ 
+ #include <ucm/api/ucm.h>
+-#include <fnmatch.h>
+-#include <stdio.h>
+ 
+ /***********************************************************************/
+ 
+@@ -28,8 +25,7 @@ opal_common_ucx_module_t opal_common_ucx = {
+     .verbose             = 0,
+     .progress_iterations = 100,
+     .registered          = 0,
+-    .opal_mem_hooks      = 0,
+-    .tls                 = NULL
++    .opal_mem_hooks      = 0
+ };
+ 
+ static void opal_common_ucx_mem_release_cb(void *buf, size_t length,
+@@ -40,15 +36,10 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length,
+ 
+ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
+ {
+-    static const char *default_tls     = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc";
+-    static const char *default_devices = "mlx*";
+     static int registered = 0;
+     static int hook_index;
+     static int verbose_index;
+     static int progress_index;
+-    static int tls_index;
+-    static int devices_index;
+-
+     if (!registered) {
+         verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
+                                               "Verbose level of the UCX components",
+@@ -69,29 +60,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
+                                            OPAL_INFO_LVL_3,
+                                            MCA_BASE_VAR_SCOPE_LOCAL,
+                                            &opal_common_ucx.opal_mem_hooks);
+-
+-        opal_common_ucx.tls  = malloc(sizeof(*opal_common_ucx.tls));
+-        *opal_common_ucx.tls = strdup(default_tls);
+-        tls_index = mca_base_var_register("opal", "opal_common", "ucx", "tls",
+-                                          "List of UCX transports which should be supported on the system, to enable "
+-                                          "selecting the UCX component. Special values: any (any available). "
+-                                          "A '^' prefix negates the list. "
+-                                          "For example, in order to exclude on shared memory and TCP transports, "
+-                                          "please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
+-                                          MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
+-                                          OPAL_INFO_LVL_3,
+-                                          MCA_BASE_VAR_SCOPE_LOCAL,
+-                                          opal_common_ucx.tls);
+-
+-        opal_common_ucx.devices  = malloc(sizeof(*opal_common_ucx.devices));
+-        *opal_common_ucx.devices = strdup(default_devices);
+-        devices_index = mca_base_var_register("opal", "opal_common", "ucx", "devices",
+-                                              "List of device driver pattern names, which, if supported by UCX, will "
+-                                              "bump its priority above ob1. Special values: any (any available)",
+-                                              MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
+-                                              OPAL_INFO_LVL_3,
+-                                              MCA_BASE_VAR_SCOPE_LOCAL,
+-                                              opal_common_ucx.devices);
+         registered = 1;
+     }
+     if (component) {
+@@ -107,14 +75,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
+                                       component->mca_type_name,
+                                       component->mca_component_name,
+                                       "opal_mem_hooks", 0);
+-        mca_base_var_register_synonym(tls_index, component->mca_project_name,
+-                                      component->mca_type_name,
+-                                      component->mca_component_name,
+-                                      "tls", 0);
+-        mca_base_var_register_synonym(devices_index, component->mca_project_name,
+-                                      component->mca_type_name,
+-                                      component->mca_component_name,
+-                                      "devices", 0);
+     }
+ }
+ 
+@@ -163,166 +123,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void)
+     opal_output_close(opal_common_ucx.output);
+ }
+ 
+-#if HAVE_DECL_OPEN_MEMSTREAM
+-static bool opal_common_ucx_check_device(const char *device_name, char **device_list)
+-{
+-    char sysfs_driver_link[PATH_MAX];
+-    char driver_path[PATH_MAX];
+-    char *ib_device_name;
+-    char *driver_name;
+-    char **list_item;
+-    ssize_t ret;
+-
+-    /* mlx5_0:1 */
+-    ret = sscanf(device_name, "%m[^:]%*d", &ib_device_name);
+-    if (ret != 1) {
+-        return false;
+-    }
+-
+-    sysfs_driver_link[sizeof(sysfs_driver_link) - 1] = '\0';
+-    snprintf(sysfs_driver_link, sizeof(sysfs_driver_link) - 1,
+-             "/sys/class/infiniband/%s/device/driver", ib_device_name);
+-    free(ib_device_name);
+-
+-    driver_path[sizeof(driver_path) - 1] = '\0';
+-    ret = readlink(sysfs_driver_link, driver_path, sizeof(driver_path) - 1);
+-    if (ret < 0) {
+-        MCA_COMMON_UCX_VERBOSE(2, "readlink(%s) failed: %s", sysfs_driver_link,
+-                               strerror(errno));
+-        return false;
+-    }
+-
+-    driver_name = basename(driver_path);
+-    for (list_item = device_list; *list_item != NULL; ++list_item) {
+-        if (!fnmatch(*list_item, driver_name, 0)) {
+-            MCA_COMMON_UCX_VERBOSE(2, "driver '%s' matched by '%s'",
+-                                   driver_path, *list_item);
+-            return true;
+-        }
+-    }
+-
+-    return false;
+-}
+-#endif
+-
+-OPAL_DECLSPEC opal_common_ucx_support_level_t
+-opal_common_ucx_support_level(ucp_context_h context)
+-{
+-    opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE;
+-    static const char *support_level_names[] = {
+-        [OPAL_COMMON_UCX_SUPPORT_NONE]      = "none",
+-        [OPAL_COMMON_UCX_SUPPORT_TRANSPORT] = "transports only",
+-        [OPAL_COMMON_UCX_SUPPORT_DEVICE]    = "transports and devices"
+-    };
+-#if HAVE_DECL_OPEN_MEMSTREAM
+-    char *rsc_tl_name, *rsc_device_name;
+-    char **tl_list, **device_list, **list_item;
+-    bool is_any_tl, is_any_device;
+-    bool found_tl, negate;
+-    char line[128];
+-    FILE *stream;
+-    char *buffer;
+-    size_t size;
+-    int ret;
+-#endif
+-
+-    is_any_tl     = !strcmp(*opal_common_ucx.tls, "any");
+-    is_any_device = !strcmp(*opal_common_ucx.devices, "any");
+-
+-    /* Check for special value "any" */
+-    if (is_any_tl && is_any_device) {
+-        MCA_COMMON_UCX_VERBOSE(1, "ucx is enabled on any transport or device",
+-                               *opal_common_ucx.tls);
+-        support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
+-        goto out;
+-    }
+-
+-#if HAVE_DECL_OPEN_MEMSTREAM
+-    /* Split transports list */
+-    negate  = ('^' == (*opal_common_ucx.tls)[0]);
+-    tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
+-    if (tl_list == NULL) {
+-        MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
+-                               *opal_common_ucx.tls);
+-        goto out;
+-    }
+-
+-    /* Split devices list */
+-    device_list = opal_argv_split(*opal_common_ucx.devices, ',');
+-    if (device_list == NULL) {
+-        MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
+-                               *opal_common_ucx.devices);
+-        goto out_free_tl_list;
+-    }
+-
+-    /* Open memory stream to dump UCX information to */
+-    stream = open_memstream(&buffer, &size);
+-    if (stream == NULL) {
+-        MCA_COMMON_UCX_VERBOSE(1, "failed to open memory stream for ucx info (%s), "
+-                               "ucx is disabled", strerror(errno));
+-        goto out_free_device_list;
+-    }
+-
+-    /* Print ucx transports information to the memory stream */
+-    ucp_context_print_info(context, stream);
+-
+-    /* Rewind and read transports/devices list from the stream */
+-    fseek(stream, 0, SEEK_SET);
+-    while ((support_level != OPAL_COMMON_UCX_SUPPORT_DEVICE) &&
+-           (fgets(line, sizeof(line), stream) != NULL)) {
+-        rsc_tl_name = NULL;
+-        ret = sscanf(line,
+-                     /* "# resource 6  :  md 5  dev 4  flags -- rc_verbs/mlx5_0:1" */
+-                     "# resource %*d : md %*d dev %*d flags -- %m[^/ \n\r]/%m[^/ \n\r]",
+-                     &rsc_tl_name, &rsc_device_name);
+-        if (ret != 2) {
+-            free(rsc_tl_name);
+-            continue;
+-        }
+-
+-        /* Check if 'rsc_tl_name' is found  provided list */
+-        found_tl = is_any_tl;
+-        for (list_item = tl_list; !found_tl && (*list_item != NULL); ++list_item) {
+-            found_tl = !strcmp(*list_item, rsc_tl_name);
+-        }
+-
+-        /* Check if the transport has a match (either positive or negative) */
+-        assert(!(is_any_tl && negate));
+-        if (found_tl != negate) {
+-            if (is_any_device ||
+-                opal_common_ucx_check_device(rsc_device_name, device_list)) {
+-                MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched both transport and device list",
+-                                    rsc_tl_name, rsc_device_name);
+-                support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
+-            } else {
+-                MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched transport list but not device list",
+-                                    rsc_tl_name, rsc_device_name);
+-                support_level = OPAL_COMMON_UCX_SUPPORT_TRANSPORT;
+-            }
+-        } else {
+-            MCA_COMMON_UCX_VERBOSE(2, "%s/%s: did not match transport list",
+-                                   rsc_tl_name, rsc_device_name);
+-        }
+-
+-        free(rsc_device_name);
+-        free(rsc_tl_name);
+-    }
+-
+-    MCA_COMMON_UCX_VERBOSE(2, "support level is %s", support_level_names[support_level]);
+-    fclose(stream);
+-    free(buffer);
+-
+-out_free_device_list:
+-    opal_argv_free(device_list);
+-out_free_tl_list:
+-    opal_argv_free(tl_list);
+-out:
+-#else
+-    MCA_COMMON_UCX_VERBOSE(2, "open_memstream() was not found, ucx is disabled");
+-#endif
+-    return support_level;
+-}
+-
+ void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status)
+ {
+ }
+diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h
+index 92cdd738ef98..202131ac8907 100644
+--- a/opal/mca/common/ucx/common_ucx.h
++++ b/opal/mca/common/ucx/common_ucx.h
+@@ -88,8 +88,6 @@ typedef struct opal_common_ucx_module {
+     int  progress_iterations;
+     int  registered;
+     bool opal_mem_hooks;
+-    char **tls;
+-    char **devices;
+ } opal_common_ucx_module_t;
+ 
+ typedef struct opal_common_ucx_del_proc {
+@@ -97,23 +95,10 @@ typedef struct opal_common_ucx_del_proc {
+     size_t   vpid;
+ } opal_common_ucx_del_proc_t;
+ 
+-typedef enum {
+-    /* No supported transports found (according to configured list of supported
+-       transports) */
+-    OPAL_COMMON_UCX_SUPPORT_NONE,
+-
+-    /* Have supported transports but not supported devices */
+-    OPAL_COMMON_UCX_SUPPORT_TRANSPORT,
+-
+-    /* Have both supported transports and supported devices */
+-    OPAL_COMMON_UCX_SUPPORT_DEVICE,
+-} opal_common_ucx_support_level_t;
+-
+ extern opal_common_ucx_module_t opal_common_ucx;
+ 
+ OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
+ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
+-OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_context_h context);
+ OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
+ OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
+ OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
+diff --git a/opal/mca/common/ucx/configure.m4 b/opal/mca/common/ucx/configure.m4
+index af8628a889c6..27e07c2005b2 100644
+--- a/opal/mca/common/ucx/configure.m4
++++ b/opal/mca/common/ucx/configure.m4
+@@ -18,8 +18,6 @@ AC_DEFUN([MCA_opal_common_ucx_CONFIG],[
+                [common_ucx_happy="yes"],
+                [common_ucx_happy="no"])
+ 
+-    AC_CHECK_DECLS([open_memstream], [], [], [[#include <stdio.h>]])
+-
+     AS_IF([test "$common_ucx_happy" = "yes"],
+           [$1],
+           [$2])
+-- 
+2.31.1
+
diff --git a/266189935aef4fce825d0db831b4b53accc62c32.patch b/266189935aef4fce825d0db831b4b53accc62c32.patch
new file mode 100644
index 0000000..ac960e4
--- /dev/null
+++ b/266189935aef4fce825d0db831b4b53accc62c32.patch
@@ -0,0 +1,33 @@
+From 266189935aef4fce825d0db831b4b53accc62c32 Mon Sep 17 00:00:00 2001
+From: Jeff Squyres <jsquyres@cisco.com>
+Date: Tue, 22 Jun 2021 22:28:37 -0400
+Subject: [PATCH] fbtl-posix: link to common_ompio
+
+The posix fbtl calls mca_common_ompio_progress(), which resides in
+common/ompio (i.e., libmca_common_ompio.la).  So add that into
+mca_fbtl_posix_la_LIBADD (like we do in a few other OMPIO-based
+components).  Failure to do this *can* lead to the posix fbtl
+component failing to load (depending on whether other OMPIO-based
+components that pull in libmca_common_ompio were loaded first).
+
+Thanks to Honggang Li for raising the issue.
+
+Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
+---
+ ompi/mca/fbtl/posix/Makefile.am | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/ompi/mca/fbtl/posix/Makefile.am b/ompi/mca/fbtl/posix/Makefile.am
+index a7b0624d3ec..1ce19cb09b7 100644
+--- a/ompi/mca/fbtl/posix/Makefile.am
++++ b/ompi/mca/fbtl/posix/Makefile.am
+@@ -34,7 +34,8 @@ mcacomponentdir = $(ompilibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+ mca_fbtl_posix_la_SOURCES = $(sources)
+ mca_fbtl_posix_la_LDFLAGS = -module -avoid-version
+-mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
++mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
++    $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ompio/libmca_common_ompio.la
+ 
+ noinst_LTLIBRARIES = $(component_noinst)
+ libmca_fbtl_posix_la_SOURCES = $(sources)
diff --git a/8322.patch b/8322.patch
deleted file mode 100644
index 8c04e5d..0000000
--- a/8322.patch
+++ /dev/null
@@ -1,1682 +0,0 @@
-From 31068e063b8795ae11f3a59d4080db1fe111cfaf Mon Sep 17 00:00:00 2001
-From: George Bosilca <bosilca@icl.utk.edu>
-Date: Mon, 28 Dec 2020 15:36:05 -0500
-Subject: [PATCH 1/3] Major update to the AVX* detection and support
-
-1. Consistent march flag order between configure and make.
-
-2. op/avx: give the option to skip some tests
-
-it is possible to skip some intrinsic tests by setting some environment variables to "no" before invoking configure:
- - ompi_cv_op_avx_check_avx512
- - ompi_cv_op_avx_check_avx2
- - ompi_cv_op_avx_check_avx
- - ompi_cv_op_avx_check_sse41
- - ompi_cv_op_avx_check_sse3
-
-3. op/avx: update AVX512 flags
-
-try
--mavx512f -mavx512bw -mavx512vl -mavx512dq
-instead of
--march=skylake-avx512
-
-since the former is less likely to conflict with user provided CFLAGS
-(e.g. -march=...)
-
-Thanks Bart Oldeman for pointing this.
-
-4. op/avx: have the op/avx library depend on libmpi.so
-
-Refs. open-mpi/ompi#8323
-
-Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
-Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
----
- ompi/mca/op/avx/Makefile.am  |   4 +-
- ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++-----------------
- 2 files changed, 174 insertions(+), 155 deletions(-)
-
-diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
-index 41dcf2e1834..b1d84d90b33 100644
---- a/ompi/mca/op/avx/Makefile.am
-+++ b/ompi/mca/op/avx/Makefile.am
-@@ -2,7 +2,7 @@
- # Copyright (c) 2019-2020 The University of Tennessee and The University
- #                         of Tennessee Research Foundation.  All rights
- #                         reserved.
--# Copyright (c) 2020      Research Organization for Information Science
-+# Copyright (c) 2020-2021 Research Organization for Information Science
- #                         and Technology (RIST).  All rights reserved.
- # $COPYRIGHT$
- #
-@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir)
- mcacomponent_LTLIBRARIES = $(component_install)
- mca_op_avx_la_SOURCES = $(sources)
- mca_op_avx_la_LIBADD = $(specialized_op_libs)
--mca_op_avx_la_LDFLAGS = -module -avoid-version
-+mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
- 
- 
- # Specific information for static builds.
-diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
-index 09d8b374c8e..f61b7100ef4 100644
---- a/ompi/mca/op/avx/configure.m4
-+++ b/ompi/mca/op/avx/configure.m4
-@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-     op_avx_support=0
-     op_avx2_support=0
-     op_avx512_support=0
-+
-+    AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3])
-+    AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41])
-+    AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx])
-+    AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2])
-+    AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512])
-+
-     OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
- 
-     AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
-@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            #
-            # Check for AVX512 support
-            #
--           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
--           AC_LINK_IFELSE(
--               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                                [[
--    __m512 vA, vB;
--    _mm512_add_ps(vA, vB)
--                                ]])],
--               [op_avx512_support=1
--                AC_MSG_RESULT([yes])],
--               [AC_MSG_RESULT([no])])
--
--           AS_IF([test $op_avx512_support -eq 0],
--                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
--                  op_avx_cflags_save="$CFLAGS"
--                  CFLAGS="$CFLAGS -march=skylake-avx512"
-+           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
-+           AS_IF([test "$op_avx_check_avx512" = "yes"],
-+                 [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
-                   AC_LINK_IFELSE(
-                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                                        [[
-@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-     _mm512_add_ps(vA, vB)
-                                        ]])],
-                       [op_avx512_support=1
--                       MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
-                        AC_MSG_RESULT([yes])],
-                       [AC_MSG_RESULT([no])])
--                  CFLAGS="$op_avx_cflags_save"
--                 ])
--           #
--           # Some combination of gcc and older as would not correctly build the code generated by
--           # _mm256_loadu_si256. Screen them out.
--           #
--           AS_IF([test $op_avx512_support -eq 1],
--                 [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
--                  op_avx_cflags_save="$CFLAGS"
--                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
--                  AC_LINK_IFELSE(
--                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                               [[
-+
-+                  AS_IF([test $op_avx512_support -eq 0],
-+                        [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)])
-+                         op_avx_cflags_save="$CFLAGS"
-+                         CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS"
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                              [[
-+    __m512 vA, vB;
-+    _mm512_add_ps(vA, vB)
-+                                       ]])],
-+                             [op_avx512_support=1
-+                              MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq"
-+                              AC_MSG_RESULT([yes])],
-+                             [AC_MSG_RESULT([no])])
-+                         CFLAGS="$op_avx_cflags_save"
-+                        ])
-+                  #
-+                  # Some combination of gcc and older as would not correctly build the code generated by
-+                  # _mm256_loadu_si256. Screen them out.
-+                  #
-+                  AS_IF([test $op_avx512_support -eq 1],
-+                        [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
-+                         op_avx_cflags_save="$CFLAGS"
-+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                      [[
-     int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
-     __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
--                               ]])],
--                      [AC_MSG_RESULT([yes])],
--                      [op_avx512_support=0
--                       MCA_BUILD_OP_AVX512_FLAGS=""
--                       AC_MSG_RESULT([no])])
--                  CFLAGS="$op_avx_cflags_save"
--                 ])
--           #
--           # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
--           #
--           AS_IF([test $op_avx512_support -eq 1],
--                 [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
--                  op_avx_cflags_save="$CFLAGS"
--                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
--                  AC_LINK_IFELSE(
--                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                               [[
-+                                      ]])],
-+                             [AC_MSG_RESULT([yes])],
-+                             [op_avx512_support=0
-+                              MCA_BUILD_OP_AVX512_FLAGS=""
-+                              AC_MSG_RESULT([no])])
-+                         CFLAGS="$op_avx_cflags_save"
-+                        ])
-+                  #
-+                  # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
-+                  #
-+                  AS_IF([test $op_avx512_support -eq 1],
-+                        [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
-+                         op_avx_cflags_save="$CFLAGS"
-+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                      [[
-     __m512i vA, vB;
-     _mm512_mullo_epi64(vA, vB)
--                               ]])],
--                      [AC_MSG_RESULT([yes])],
--                      [op_avx512_support=0
--                       MCA_BUILD_OP_AVX512_FLAGS=""
--                       AC_MSG_RESULT([no])])
--                  CFLAGS="$op_avx_cflags_save"
--                 ])
-+                                      ]])],
-+                             [AC_MSG_RESULT([yes])],
-+                             [op_avx512_support=0
-+                              MCA_BUILD_OP_AVX512_FLAGS=""
-+                              AC_MSG_RESULT([no])])
-+                         CFLAGS="$op_avx_cflags_save"
-+                        ])])
-            #
-            # Check support for AVX2
-            #
--           AC_MSG_CHECKING([for AVX2 support (no additional flags)])
--           AC_LINK_IFELSE(
--               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                       [[
-+           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
-+           AS_IF([test "$op_avx_check_avx2" = "yes"],
-+                 [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
-+                  AC_LINK_IFELSE(
-+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                              [[
-     __m256 vA, vB;
-     _mm256_add_ps(vA, vB)
--                       ]])],
--               [op_avx2_support=1
--                AC_MSG_RESULT([yes])],
--               [AC_MSG_RESULT([no])])
--           AS_IF([test $op_avx2_support -eq 0],
--               [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
--                op_avx_cflags_save="$CFLAGS"
--                CFLAGS="$CFLAGS -mavx2"
--                AC_LINK_IFELSE(
--                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                            [[
-+                              ]])],
-+                      [op_avx2_support=1
-+                       AC_MSG_RESULT([yes])],
-+                      [AC_MSG_RESULT([no])])
-+                  AS_IF([test $op_avx2_support -eq 0],
-+                      [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
-+                       op_avx_cflags_save="$CFLAGS"
-+                       CFLAGS="-mavx2 $CFLAGS"
-+                       AC_LINK_IFELSE(
-+                           [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                   [[
-     __m256 vA, vB;
-     _mm256_add_ps(vA, vB)
--                            ]])],
--                    [op_avx2_support=1
--                     MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
--                     AC_MSG_RESULT([yes])],
--                    [AC_MSG_RESULT([no])])
--                CFLAGS="$op_avx_cflags_save"
--                ])
--           #
--           # Some combination of gcc and older as would not correctly build the code generated by
--           # _mm256_loadu_si256. Screen them out.
--           #
--           AS_IF([test $op_avx2_support -eq 1],
--                 [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
--                  op_avx_cflags_save="$CFLAGS"
--                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
--                  AC_LINK_IFELSE(
--                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                               [[
-+                                   ]])],
-+                           [op_avx2_support=1
-+                            MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
-+                            AC_MSG_RESULT([yes])],
-+                           [AC_MSG_RESULT([no])])
-+                       CFLAGS="$op_avx_cflags_save"
-+                       ])
-+                  #
-+                  # Some combination of gcc and older as would not correctly build the code generated by
-+                  # _mm256_loadu_si256. Screen them out.
-+                  #
-+                  AS_IF([test $op_avx2_support -eq 1],
-+                        [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
-+                         op_avx_cflags_save="$CFLAGS"
-+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                      [[
-     int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-     __m256i vA = _mm256_loadu_si256((__m256i*)&A)
--                               ]])],
--                      [AC_MSG_RESULT([yes])],
--                      [op_avx2_support=0
--                       MCA_BUILD_OP_AVX2_FLAGS=""
--                       AC_MSG_RESULT([no])])
--                  CFLAGS="$op_avx_cflags_save"
--                 ])
-+                                      ]])],
-+                             [AC_MSG_RESULT([yes])],
-+                             [op_avx2_support=0
-+                              MCA_BUILD_OP_AVX2_FLAGS=""
-+                              AC_MSG_RESULT([no])])
-+                         CFLAGS="$op_avx_cflags_save"
-+                        ])])
-            #
-            # What about early AVX support. The rest of the logic is slightly different as
-            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
-@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
-            # instructions.
-            #
--           AC_MSG_CHECKING([for AVX support (no additional flags)])
--           AC_LINK_IFELSE(
--               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                       [[
-+           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
-+           AS_IF([test "$op_avx_check_avx" = "yes"],
-+                 [AC_MSG_CHECKING([for AVX support (no additional flags)])
-+                  AC_LINK_IFELSE(
-+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                              [[
-     __m128 vA, vB;
-     _mm_add_ps(vA, vB)
--                       ]])],
--               [op_avx_support=1
--                AC_MSG_RESULT([yes])],
--               [AC_MSG_RESULT([no])])
-+                              ]])],
-+                      [op_avx_support=1
-+                       AC_MSG_RESULT([yes])],
-+                      [AC_MSG_RESULT([no])])])
-            #
-            # Check for SSE4.1 support
-            #
--           AS_IF([test $op_avx_support -eq 1],
--               [AC_MSG_CHECKING([for SSE4.1 support])
--                AC_LINK_IFELSE(
--                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                            [[
-+           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
-+           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
-+                 [AC_MSG_CHECKING([for SSE4.1 support])
-+                  AC_LINK_IFELSE(
-+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                              [[
-     __m128i vA, vB;
-     (void)_mm_max_epi8(vA, vB)
--                            ]])],
--                    [op_sse41_support=1
--                     AC_MSG_RESULT([yes])],
--                    [AC_MSG_RESULT([no])])
--                ])
-+                              ]])],
-+                      [op_sse41_support=1
-+                       AC_MSG_RESULT([yes])],
-+                      [AC_MSG_RESULT([no])])
-+                  ])
-            #
-            # Check for SSE3 support
-            #
--           AS_IF([test $op_avx_support -eq 1],
--               [AC_MSG_CHECKING([for SSE3 support])
--                AC_LINK_IFELSE(
--                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                            [[
-+           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
-+           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
-+                 [AC_MSG_CHECKING([for SSE3 support])
-+                  AC_LINK_IFELSE(
-+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                              [[
-     int A[4] = {0, 1, 2, 3};
-     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
--                            ]])],
--                    [op_sse3_support=1
--                     AC_MSG_RESULT([yes])],
--                    [AC_MSG_RESULT([no])])
--                ])
-+                              ]])],
-+                      [op_sse3_support=1
-+                       AC_MSG_RESULT([yes])],
-+                      [AC_MSG_RESULT([no])])
-+                  ])
-            # Second pass, do we need to add the AVX flag ?
-            AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
--               [AC_MSG_CHECKING([for AVX support (with -mavx)])
--                op_avx_cflags_save="$CFLAGS"
--                CFLAGS="$CFLAGS -mavx"
--                AC_LINK_IFELSE(
--                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                            [[
-+                 [AS_IF([test "$op_avx_check_avx" = "yes"],
-+                        [AC_MSG_CHECKING([for AVX support (with -mavx)])
-+                         op_avx_cflags_save="$CFLAGS"
-+                         CFLAGS="-mavx $CFLAGS"
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                   [[
-     __m128 vA, vB;
-     _mm_add_ps(vA, vB)
-                             ]])],
--                    [op_avx_support=1
--                     MCA_BUILD_OP_AVX_FLAGS="-mavx"
--                     op_sse41_support=0
--                     op_sse3_support=0
--                     AC_MSG_RESULT([yes])],
--                    [AC_MSG_RESULT([no])])
-+                             [op_avx_support=1
-+                              MCA_BUILD_OP_AVX_FLAGS="-mavx"
-+                              op_sse41_support=0
-+                              op_sse3_support=0
-+                              AC_MSG_RESULT([yes])],
-+                             [AC_MSG_RESULT([no])])])
- 
--                AS_IF([test $op_sse41_support -eq 0],
--                    [AC_MSG_CHECKING([for SSE4.1 support])
--                     AC_LINK_IFELSE(
--                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
--                                 [[
-+                  AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0],
-+                        [AC_MSG_CHECKING([for SSE4.1 support])
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                     [[
-     __m128i vA, vB;
-     (void)_mm_max_epi8(vA, vB)
--                                 ]])],
--                         [op_sse41_support=1
--                          AC_MSG_RESULT([yes])],
--                         [AC_MSG_RESULT([no])])
--                     ])
--                AS_IF([test $op_sse3_support -eq 0],
--                    [AC_MSG_CHECKING([for SSE3 support])
--                     AC_LINK_IFELSE(
--                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-+                                     ]])],
-+                             [op_sse41_support=1
-+                              AC_MSG_RESULT([yes])],
-+                             [AC_MSG_RESULT([no])])])
-+                  AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0],
-+                        [AC_MSG_CHECKING([for SSE3 support])
-+                         AC_LINK_IFELSE(
-+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                                  [[
-     int A[4] = {0, 1, 2, 3};
-     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
-                                  ]])],
--                         [op_sse3_support=1
--                          AC_MSG_RESULT([yes])],
--                         [AC_MSG_RESULT([no])])
--                     ])
--                CFLAGS="$op_avx_cflags_save"
--               ])
-+                             [op_sse3_support=1
-+                              AC_MSG_RESULT([yes])],
-+                             [AC_MSG_RESULT([no])])])
-+                  CFLAGS="$op_avx_cflags_save"])
- 
-            AC_LANG_POP([C])
-           ])
-@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-     AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
-     AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
- 
-+    AS_VAR_POPDEF([op_avx_check_avx512])
-+    AS_VAR_POPDEF([op_avx_check_avx2])
-+    AS_VAR_POPDEF([op_avx_check_avx])
-+    AS_VAR_POPDEF([op_avx_check_sse41])
-+    AS_VAR_POPDEF([op_avx_check_sse3])
-+
-     OPAL_VAR_SCOPE_POP
-     # Enable this component iff we have at least the most basic form of support
-     # for vectorial ISA
-
-From fcf2766a03e3c2a1001679013878209bcddd50ae Mon Sep 17 00:00:00 2001
-From: George Bosilca <bosilca@icl.utk.edu>
-Date: Mon, 28 Dec 2020 12:18:07 -0500
-Subject: [PATCH 2/3] AVX code generation improvements
-
-1. Allow fallback to a lesser AVX support during make
-
-Due to the fact that some distros restrict the compile architecture
-during make (while not setting any restrictions during configure) we
-need to detect the target architecture also during make in order to
-restrict the code we generate.
-
-2. Add comments and better protect the arch specific code.
-
-Identify all the vectorial functions used and classify them according to
-the necessary hardware capabilities.
-Use these requirements to protect the code for load and stores (the rest
-of the code being automatically generated it is more difficult to
-protect).
-
-3. Correctly check for AVX* support.
-
-Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
----
- ompi/mca/op/avx/configure.m4       |  28 +--
- ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++-----
- 2 files changed, 288 insertions(+), 62 deletions(-)
-
-diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
-index f61b7100ef4..f3651f09d43 100644
---- a/ompi/mca/op/avx/configure.m4
-+++ b/ompi/mca/op/avx/configure.m4
-@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            #
-            # Check for AVX512 support
-            #
--           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
-+           AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
-            AS_IF([test "$op_avx_check_avx512" = "yes"],
-                  [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
-                   AC_LINK_IFELSE(
-@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            #
-            # Check support for AVX2
-            #
--           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
-+           AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
-            AS_IF([test "$op_avx_check_avx2" = "yes"],
-                  [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
-                   AC_LINK_IFELSE(
-                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                               [[
--    __m256 vA, vB;
--    _mm256_add_ps(vA, vB)
-+    __m256i vA, vB, vC;
-+    vC = _mm256_and_si256(vA, vB)
-                               ]])],
-                       [op_avx2_support=1
-                        AC_MSG_RESULT([yes])],
-@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-                        AC_LINK_IFELSE(
-                            [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                                    [[
--    __m256 vA, vB;
--    _mm256_add_ps(vA, vB)
-+    __m256i vA, vB, vC;
-+    vC = _mm256_and_si256(vA, vB)
-                                    ]])],
-                            [op_avx2_support=1
-                             MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
-@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-                          CFLAGS="$op_avx_cflags_save"
-                         ])])
-            #
--           # What about early AVX support. The rest of the logic is slightly different as
-+           # What about early AVX support? The rest of the logic is slightly different as
-            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
-            # if we can compile AVX code without a flag, then we validate that we have support
-            # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
-            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
-            # instructions.
-            #
--           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
-+           AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
-            AS_IF([test "$op_avx_check_avx" = "yes"],
-                  [AC_MSG_CHECKING([for AVX support (no additional flags)])
-                   AC_LINK_IFELSE(
-                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                               [[
--    __m128 vA, vB;
--    _mm_add_ps(vA, vB)
-+    __m256 vA, vB, vC;
-+    vC = _mm256_add_ps(vA, vB)
-                               ]])],
-                       [op_avx_support=1
-                        AC_MSG_RESULT([yes])],
-@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            #
-            # Check for SSE4.1 support
-            #
--           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
-+           AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
-            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
-                  [AC_MSG_CHECKING([for SSE4.1 support])
-                   AC_LINK_IFELSE(
-@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-            #
-            # Check for SSE3 support
-            #
--           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
-+           AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
-            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
-                  [AC_MSG_CHECKING([for SSE3 support])
-                   AC_LINK_IFELSE(
-@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-                          AC_LINK_IFELSE(
-                              [AC_LANG_PROGRAM([[#include <immintrin.h>]],
-                                    [[
--    __m128 vA, vB;
--    _mm_add_ps(vA, vB)
-+    __m256 vA, vB, vC;
-+    vC = _mm256_add_ps(vA, vB)
-                             ]])],
-                              [op_avx_support=1
-                               MCA_BUILD_OP_AVX_FLAGS="-mavx"
-diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
-index 95a9c9ab84e..ef3f0932906 100644
---- a/ompi/mca/op/avx/op_avx_functions.c
-+++ b/ompi/mca/op/avx/op_avx_functions.c
-@@ -1,5 +1,5 @@
- /*
-- * Copyright (c) 2019-2020 The University of Tennessee and The University
-+ * Copyright (c) 2019-2021 The University of Tennessee and The University
-  *                         of Tennessee Research Foundation.  All rights
-  *                         reserved.
-  * Copyright (c) 2020      Research Organization for Information Science
-@@ -24,16 +24,42 @@
- #include "ompi/mca/op/avx/op_avx.h"
- 
- #include <immintrin.h>
--
-+/**
-+ * The following logic is necessary to cope with distro maintainer's desire to change the compilation
-+ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what
-+ * code can be generated during make. If we detect that the current code generation architecture has
-+ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back
-+ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out).
-+ */
- #if defined(GENERATE_AVX512_CODE)
--#define PREPEND _avx512
--#elif defined(GENERATE_AVX2_CODE)
--#define PREPEND _avx2
--#elif defined(GENERATE_AVX_CODE)
--#define PREPEND _avx
--#else
--#error This file should not be compiled in this conditions
--#endif
-+#  if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__)
-+#    define PREPEND _avx512
-+#  else
-+#    undef GENERATE_AVX512_CODE
-+#  endif  /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */
-+#endif  /* defined(GENERATE_AVX512_CODE) */
-+
-+#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE)
-+#  if defined(__AVX2__)
-+#    define PREPEND _avx2
-+#  else
-+#    undef GENERATE_AVX2_CODE
-+#  endif  /* defined(__AVX2__) */
-+#endif  /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */
-+
-+#if !defined(PREPEND) && defined(GENERATE_AVX_CODE)
-+#  if defined(__AVX__)
-+#    define PREPEND _avx
-+#  endif
-+#endif  /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */
-+
-+#if !defined(PREPEND)
-+#  if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2
-+#    error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step.
-+#  else
-+#    error This file should not be compiled in this conditions. Please provide the config.log file to the OMPI developers.
-+#  endif  /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */
-+#endif  /* !defined(PREPEND) */
- 
- /*
-  * Concatenate preprocessor tokens A and B without expanding macro definitions
-@@ -46,6 +72,102 @@
-  */
- #define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
- 
-+/*
-+ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq
-+ *
-+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide
-+ *
-+ * _mm_add_epi[8,16,32,64]         SSE2
-+ * _mm_add_pd                      SSE2
-+ * _mm_add_ps                      SSE
-+ * _mm_adds_epi[8,16]              SSE2
-+ * _mm_adds_epu[8,16]              SSE2
-+ * _mm_and_si128                   SSE2
-+ * _mm_lddqu_si128                 SSE3
-+ * _mm_loadu_pd                    SSE2
-+ * _mm_loadu_ps                    SSE
-+ * _mm_max_epi8                    SSE4.1
-+ * _mm_max_epi16                   SSE2
-+ * _mm_max_epi32                   SSE4.1
-+ * _mm_max_epi64                   AVX512VL + AVX512F
-+ * _mm_max_epu8                    SSE2
-+ * _mm_max_epu[16,32]              SSE4.1
-+ * _mm_max_epu64                   AVX512VL + AVX512F
-+ * _mm_max_pd                      SSE2
-+ * _mm_max_ps                      SSE
-+ * _mm_min_epi8                    SSE4.1
-+ * _mm_min_epi16                   SSE2
-+ * _mm_min_epi32                   SSE4.1
-+ * _mm_min_epi64                   AVX512VL + AVX512F
-+ * _mm_min_epu8                    SSE2
-+ * _mm_min_epu[16,32]              SSE4.1
-+ * _mm_min_epu64                   AVX512VL + AVX512F
-+ * _mm_min_pd                      SSE2
-+ * _mm_min_ps                      SSE
-+ * _mm_mul_pd                      SSE2
-+ * _mm_mul_ps                      SSE
-+ * _mm_mullo_epi16                 SSE2
-+ * _mm_mullo_epi32                 SSE4.1
-+ * _mm_mullo_epi64                 AVX512VL + AVX512DQ
-+ * _mm_or_si128                    SSE2
-+ * _mm_storeu_pd                   SSE2
-+ * _mm_storeu_ps                   SSE
-+ * _mm_storeu_si128                SSE2
-+ * _mm_xor_si128                   SSE2
-+ * _mm256_add_epi[8,16,32,64]      AVX2
-+ * _mm256_add_p[s,d]               AVX
-+ * _mm256_adds_epi[8,16]           AVX2
-+ * _mm256_adds_epu[8,16]           AVX2
-+ * _mm256_and_si256                AVX2
-+ * _mm256_loadu_p[s,d]             AVX
-+ * _mm256_loadu_si256              AVX
-+ * _mm256_max_epi[8,16,32]         AVX2
-+ * _mm256_max_epi64                AVX512VL + AVX512F
-+ * _mm256_max_epu[8,16,32]         AVX2
-+ * _mm256_max_epu64                AVX512VL + AVX512F
-+ * _mm256_max_p[s,d]               AVX
-+ * _mm256_min_epi[8,16,32]         AVX2
-+ * _mm256_min_epi64                AVX512VL + AVX512F
-+ * _mm256_min_epu[8,16,32]         AVX2
-+ * _mm256_min_epu64                AVX512VL + AVX512F
-+ * _mm256_min_p[s,d]               AVX
-+ * _mm256_mul_p[s,d]               AVX
-+ * _mm256_mullo_epi[16,32]         AVX2
-+ * _mm256_mullo_epi64              AVX512VL + AVX512DQ
-+ * _mm256_or_si256                 AVX2
-+ * _mm256_storeu_p[s,d]            AVX
-+ * _mm256_storeu_si256             AVX
-+ * _mm256_xor_si256                AVX2
-+ * _mm512_add_epi[8,16]            AVX512BW
-+ * _mm512_add_epi[32,64]           AVX512F
-+ * _mm512_add_p[s,d]               AVX512F
-+ * _mm512_adds_epi[8,16]           AVX512BW
-+ * _mm512_adds_epu[8,16]           AVX512BW
-+ * _mm512_and_si512                AVX512F
-+ * _mm512_cvtepi16_epi8            AVX512BW
-+ * _mm512_cvtepi8_epi16            AVX512BW
-+ * _mm512_loadu_p[s,d]             AVX512F
-+ * _mm512_loadu_si512              AVX512F
-+ * _mm512_max_epi[8,16]            AVX512BW
-+ * _mm512_max_epi[32,64]           AVX512F
-+ * _mm512_max_epu[8,16]            AVX512BW
-+ * _mm512_max_epu[32,64]           AVX512F
-+ * _mm512_max_p[s,d]               AVX512F
-+ * _mm512_min_epi[8,16]            AVX512BW
-+ * _mm512_min_epi[32,64]           AVX512F
-+ * _mm512_min_epu[8,16]            AVX512BW
-+ * _mm512_min_epu[32,64]           AVX512F
-+ * _mm512_min_p[s,d]               AVX512F
-+ * _mm512_mul_p[s,d]               AVX512F
-+ * _mm512_mullo_epi16              AVX512BW
-+ * _mm512_mullo_epi32              AVX512F
-+ * _mm512_mullo_epi64              AVX512DQ
-+ * _mm512_or_si512                 AVX512F
-+ * _mm512_storeu_p[s,d]            AVX512F
-+ * _mm512_storeu_si512             AVX512F
-+ * _mm512_xor_si512                AVX512F
-+ */
-+
- /*
-  * Since all the functions in this file are essentially identical, we
-  * use a macro to substitute in names and types.  The core operation
-@@ -62,13 +184,14 @@
-   (((_flag) & mca_op_avx_component.flags) == (_flag))
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op)               \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
-         int types_per_step = (512 / 8) / sizeof(type);                         \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
--            __m512i vecA =  _mm512_loadu_si512((__m512*)in);                   \
-+            __m512i vecA = _mm512_loadu_si512((__m512*)in);                    \
-             in += types_per_step;                                              \
--            __m512i vecB =  _mm512_loadu_si512((__m512*)out);                  \
-+            __m512i vecB = _mm512_loadu_si512((__m512*)out);                   \
-             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB);  \
-             _mm512_storeu_si512((__m512*)out, res);                            \
-             out += types_per_step;                                             \
-@@ -76,10 +199,14 @@
-         if( 0 == left_over ) return;                                           \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op)                 \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) {  \
-         int types_per_step = (256 / 8) / sizeof(type);  /* AVX2 */             \
-@@ -87,30 +214,37 @@
-             __m256i vecA = _mm256_loadu_si256((__m256i*)in);                   \
-             in += types_per_step;                                              \
-             __m256i vecB = _mm256_loadu_si256((__m256i*)out);                  \
--            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
-+            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB);  \
-             _mm256_storeu_si256((__m256i*)out, res);                           \
-             out += types_per_step;                                             \
-         }                                                                      \
-         if( 0 == left_over ) return;                                           \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE3__
- #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op)               \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
--        int types_per_step = (128 / 8) / sizeof(type);  /* AVX */              \
-+        int types_per_step = (128 / 8) / sizeof(type);                         \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
-             __m128i vecA = _mm_lddqu_si128((__m128i*)in);                      \
-             in += types_per_step;                                              \
-             __m128i vecB = _mm_lddqu_si128((__m128i*)out);                     \
--            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB);    \
-+            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB);     \
-             _mm_storeu_si128((__m128i*)out, res);                              \
-             out += types_per_step;                                             \
-         }                                                                      \
-     }
- #else
-+#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
-+#endif  /* __SSE3__ */
-+#else
- #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512BW__ && __AVX__
- #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op)         \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {  \
-         int types_per_step = (256 / 8) / sizeof(type);                  \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in);       \
--            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)out);      \
-+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in);        \
-+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out);       \
-             in += types_per_step;                                       \
-             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
-             __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp);              \
-@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
-+#endif  /* __AVX512BW__ && __AVX__ */
-+#else
- #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- /**
-@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
-  *
-  */
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op)               \
-     if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {        \
-         types_per_step = (512 / 8) / sizeof(type);                      \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512i vecA =  _mm512_loadu_si512((__m512i*)in);           \
-+            __m512i vecA = _mm512_loadu_si512((__m512i*)in);            \
-             in += types_per_step;                                       \
--            __m512i vecB =  _mm512_loadu_si512((__m512i*)out);          \
-+            __m512i vecB = _mm512_loadu_si512((__m512i*)out);           \
-             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
-             _mm512_storeu_si512((__m512i*)out, res);                    \
-             out += types_per_step;                                      \
-@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op)                 \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
-         types_per_step = (256 / 8) / sizeof(type);                      \
-@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
-             __m256i vecA = _mm256_loadu_si256((__m256i*)in);            \
-             in += types_per_step;                                       \
-             __m256i vecB = _mm256_loadu_si256((__m256i*)out);           \
--            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
-+            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
-             _mm256_storeu_si256((__m256i*)out, res);                    \
-             out += types_per_step;                                      \
-         }                                                               \
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE3__ && __SSE2__
- #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op)                 \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
-         types_per_step = (128 / 8) / sizeof(type);                      \
-@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
-             __m128i vecA = _mm_lddqu_si128((__m128i*)in);               \
-             in += types_per_step;                                       \
-             __m128i vecB = _mm_lddqu_si128((__m128i*)out);              \
--            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
-+            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
-             _mm_storeu_si128((__m128i*)out, res);                       \
-             out += types_per_step;                                      \
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
-+#endif  /* __SSE3__ && __SSE2__ */
-+#else
- #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_FLOAT_FUNC(op)                                    \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
-         types_per_step = (512 / 8) / sizeof(float);                     \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512 vecA =  _mm512_loadu_ps((__m512*)in);                \
--            __m512 vecB =  _mm512_loadu_ps((__m512*)out);         \
-+            __m512 vecA = _mm512_loadu_ps((__m512*)in);                 \
-+            __m512 vecB = _mm512_loadu_ps((__m512*)out);                \
-             in += types_per_step;                                       \
-             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
-             _mm512_storeu_ps((__m512*)out, res);                        \
-@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_FLOAT_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX_FLOAT_FUNC(op)                                       \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
-         types_per_step = (256 / 8) / sizeof(float);                     \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
--            __m256 vecA =  _mm256_loadu_ps(in);                          \
-+            __m256 vecA = _mm256_loadu_ps(in);                          \
-             in += types_per_step;                                       \
--            __m256 vecB =  _mm256_loadu_ps(out);                         \
-+            __m256 vecB = _mm256_loadu_ps(out);                         \
-             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
--            _mm256_storeu_ps(out, res);                                  \
-+            _mm256_storeu_ps(out, res);                                 \
-             out += types_per_step;                                      \
-         }                                                               \
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX_FLOAT_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE__
- #define OP_AVX_SSE_FLOAT_FUNC(op)                                       \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
-         types_per_step = (128 / 8) / sizeof(float);                     \
-@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
-+#endif  /* __SSE__ */
-+#else
- #define OP_AVX_SSE_FLOAT_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_DOUBLE_FUNC(op)                                   \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
-         types_per_step = (512 / 8)  / sizeof(double);                   \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512d vecA =  _mm512_loadu_pd(in);                        \
-+            __m512d vecA = _mm512_loadu_pd(in);                         \
-             in += types_per_step;                                       \
--            __m512d vecB =  _mm512_loadu_pd(out);                       \
-+            __m512d vecB = _mm512_loadu_pd(out);                        \
-             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
-             _mm512_storeu_pd((out), res);                               \
-             out += types_per_step;                                      \
-@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX_DOUBLE_FUNC(op)                                      \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
-         types_per_step = (256 / 8)  / sizeof(double);                   \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
--            __m256d vecA =  _mm256_loadu_pd(in);                        \
-+            __m256d vecA = _mm256_loadu_pd(in);                         \
-             in += types_per_step;                                       \
--            __m256d vecB =  _mm256_loadu_pd(out);                       \
-+            __m256d vecB = _mm256_loadu_pd(out);                        \
-             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
-             _mm256_storeu_pd(out, res);                                 \
-             out += types_per_step;                                      \
-@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
-         if( 0 == left_over ) return;                                    \
-       }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX_DOUBLE_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE2__
- #define OP_AVX_SSE2_DOUBLE_FUNC(op)                                     \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
-         types_per_step = (128 / 8)  / sizeof(double);                   \
-@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
-+#endif  /* __SSE2__ */
-+#else
- #define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
-  *  routines, needed for some optimizations.
-  */
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op)      \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {   \
-         int types_per_step = (512 / 8) / sizeof(type);                  \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512i vecA =  _mm512_loadu_si512(in1);                    \
--            __m512i vecB =  _mm512_loadu_si512(in2);                    \
-+            __m512i vecA = _mm512_loadu_si512(in1);                     \
-+            __m512i vecB = _mm512_loadu_si512(in2);                     \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op)        \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
-         int types_per_step = (256 / 8) / sizeof(type);                  \
-@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
-             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
--            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
-+            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
-             _mm256_storeu_si256((__m256i*)out, res);                    \
-             out += types_per_step;                                      \
-         }                                                               \
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE3__ && __SSE2__
- #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op)      \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) {       \
-         int types_per_step = (128 / 8) / sizeof(type);                  \
-@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
-             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
--            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
-+            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
-             _mm_storeu_si128((__m128i*)out, res);                       \
-             out += types_per_step;                                      \
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
-+#endif  /* __SSE3__ && __SSE2__ */
-+#else
- #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512BW__ && __AVX__
- #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op)       \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
-         int types_per_step = (256 / 8) / sizeof(type);                  \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in1);      \
--            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)in2);      \
-+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1);       \
-+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2);       \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
-@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
-         if( 0 == left_over ) return;                                    \
-   }
- #else
-+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
-+#endif  /* __AVX512BW__ && __AVX__ */
-+#else
- #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- /**
-@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op)             \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
-         types_per_step = (512 / 8) / sizeof(type);                      \
-         for (; left_over >= types_per_step; left_over -= types_per_step) {  \
--            __m512i vecA =  _mm512_loadu_si512(in1);                    \
--            __m512i vecB =  _mm512_loadu_si512(in2);                    \
-+            __m512i vecA = _mm512_loadu_si512(in1);                     \
-+            __m512i vecB = _mm512_loadu_si512(in2);                     \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
-@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op)               \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
-         types_per_step = (256 / 8) / sizeof(type);                      \
-@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
-             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
--            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
-+            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
-             _mm256_storeu_si256((__m256i*)out, res);                    \
-             out += types_per_step;                                      \
-         }                                                               \
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE3__ && __SSE2__
- #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op)               \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
-         types_per_step = (128 / 8) / sizeof(type);                      \
-@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
-             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
--            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
-+            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
-             _mm_storeu_si128((__m128i*)out, res);                       \
-             out += types_per_step;                                      \
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
-+#endif  /* __SSE3__ && __SSE2__ */
-+#else
- #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_FLOAT_FUNC_3(op)                                  \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
-         types_per_step = (512 / 8) / sizeof(float);                     \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512 vecA =  _mm512_loadu_ps(in1);                        \
--            __m512 vecB =  _mm512_loadu_ps(in2);                        \
-+            __m512 vecA = _mm512_loadu_ps(in1);                         \
-+            __m512 vecB = _mm512_loadu_ps(in2);                         \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
-@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX_FLOAT_FUNC_3(op)                                     \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
-         types_per_step = (256 / 8) / sizeof(float);                     \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
--            __m256 vecA =  _mm256_loadu_ps(in1);                        \
--            __m256 vecB =  _mm256_loadu_ps(in2);                        \
-+            __m256 vecA = _mm256_loadu_ps(in1);                         \
-+            __m256 vecB = _mm256_loadu_ps(in2);                         \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
-@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE__
- #define OP_AVX_SSE_FLOAT_FUNC_3(op)                  \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
-         types_per_step = (128 / 8) / sizeof(float);                     \
-@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
-+#endif  /* __SSE__ */
-+#else
- #define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
- }
- 
- #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
-+#if __AVX512F__
- #define OP_AVX_AVX512_DOUBLE_FUNC_3(op)                                 \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
-         types_per_step = (512 / 8) / sizeof(double);                    \
-         for (; left_over >= types_per_step; left_over -= types_per_step) { \
--            __m512d vecA =  _mm512_loadu_pd((in1));                     \
--            __m512d vecB =  _mm512_loadu_pd((in2));                     \
-+            __m512d vecA = _mm512_loadu_pd((in1));                      \
-+            __m512d vecB = _mm512_loadu_pd((in2));                      \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
-@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
-+#endif  /* __AVX512F__ */
-+#else
- #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
- 
- #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
-+#if __AVX__
- #define OP_AVX_AVX_DOUBLE_FUNC_3(op)                                    \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
-         types_per_step = (256 / 8) / sizeof(double);                    \
-         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
--            __m256d vecA =  _mm256_loadu_pd(in1);                       \
--            __m256d vecB =  _mm256_loadu_pd(in2);                       \
-+            __m256d vecA = _mm256_loadu_pd(in1);                        \
-+            __m256d vecB = _mm256_loadu_pd(in2);                        \
-             in1 += types_per_step;                                      \
-             in2 += types_per_step;                                      \
-             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
-@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
-         if( 0 == left_over ) return;                                    \
-     }
- #else
-+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
-+#endif  /* __AVX__ */
-+#else
- #define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
- 
- #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
-+#if __SSE2__
- #define OP_AVX_SSE2_DOUBLE_FUNC_3(op)                                   \
-     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
-         types_per_step = (128 / 8) / sizeof(double);                    \
-@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
-         }                                                               \
-     }
- #else
-+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
-+#endif  /* __SSE2__ */
-+#else
- #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
- #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
- 
-
-From 20be3fc25713ac2de3eb4d77b85248d7fe2bc28b Mon Sep 17 00:00:00 2001
-From: George Bosilca <bosilca@icl.utk.edu>
-Date: Tue, 5 Jan 2021 22:40:26 -0500
-Subject: [PATCH 3/3] A better test for MPI_OP performance.
-
-The test now has the ability to add a shift to all or to any of the
-input and output buffers to assess the impact of unaligned operations.
-
-Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
----
- test/datatype/reduce_local.c | 161 ++++++++++++++++++++++-------------
- 1 file changed, 104 insertions(+), 57 deletions(-)
-
-diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c
-index 97890f94227..f227439b714 100644
---- a/test/datatype/reduce_local.c
-+++ b/test/datatype/reduce_local.c
-@@ -59,7 +59,7 @@ static int total_errors = 0;
-      _a < _b ? _a : _b; })
- 
- static void print_status(char* op, char* type, int type_size,
--                         int count, double duration,
-+                         int count, int max_shift, double *duration, int repeats,
-                          int correct )
- {
-     if(correct) {
-@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size,
-         printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
-         total_errors++;
-     }
--    printf(" count  %-10d  time %.6f seconds\n", count, duration);
-+    if( 1 == max_shift ) {
-+        printf(" count  %-10d  time (seconds) %.8f seconds\n", count, duration[0] / repeats);
-+    } else {
-+        printf(" count  %-10d  time (seconds / shifts) ", count);
-+        for( int i = 0; i < max_shift; i++ ) {
-+            printf("%.8f ", duration[i] / repeats );
-+        }
-+        printf("\n");
-+    }
- }
- 
- static int do_ops_built = 0;
-@@ -115,19 +123,23 @@ do { \
-     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
-     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
-     skip_op_type = 0; \
--    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
--        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
--        tstart = MPI_Wtime(); \
--        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
--        tend = MPI_Wtime(); \
--        if( check ) { \
--            for( i = 0; i < (COUNT)-_k; i++ ) { \
--                if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
--                    continue; \
--                printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
--                       _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
--                correctness = 0; \
--                break; \
-+    for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
-+        duration[_k] = 0.0; \
-+        for(int _r = repeats; _r > 0; _r--) { \
-+            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
-+            tstart = MPI_Wtime(); \
-+            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
-+            tend = MPI_Wtime(); \
-+            duration[_k] += (tend - tstart); \
-+            if( check ) { \
-+                for( i = 0; i < (COUNT)-_k; i++ ) { \
-+                    if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
-+                        continue; \
-+                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
-+                           _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
-+                    correctness = 0; \
-+                    break; \
-+                } \
-             } \
-         } \
-     } \
-@@ -139,20 +151,24 @@ do { \
-     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
-     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
-     skip_op_type = 0; \
--    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
--        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
--        tstart = MPI_Wtime(); \
--        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
--        tend = MPI_Wtime(); \
--        if( check ) { \
--            for( i = 0; i < (COUNT); i++ ) { \
--                TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
--                if(_v2 == OPNAME(_v1, _v3)) \
--                    continue; \
--                printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
--                       _k, i, _v1, (#OPNAME), _v3, _v2); \
--                correctness = 0; \
--                break; \
-+    for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
-+        duration[_k] = 0.0; \
-+        for(int _r = repeats; _r > 0; _r--) { \
-+            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
-+            tstart = MPI_Wtime(); \
-+            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
-+            tend = MPI_Wtime(); \
-+            duration[_k] += (tend - tstart); \
-+            if( check ) { \
-+                for( i = 0; i < (COUNT); i++ ) { \
-+                    TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
-+                    if(_v2 == OPNAME(_v1, _v3)) \
-+                        continue; \
-+                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
-+                           _k, i, _v1, (#OPNAME), _v3, _v2); \
-+                    correctness = 0; \
-+                    break; \
-+                } \
-             } \
-         } \
-     } \
-@@ -163,24 +179,36 @@ int main(int argc, char **argv)
- {
-     static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
-     int count, type_size = 8, rank, size, provided, correctness = 1;
--    int repeats = 1, i, c;
--    double tstart, tend;
-+    int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0;
-+    int max_shift = 4;
-+    double *duration, tstart, tend;
-     bool check = true;
-     char type[5] = "uifd", *op = "sum", *mpi_type;
-     int lower = 1, upper = 1000000, skip_op_type;
-     MPI_Op mpi_op;
- 
--    while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) {
-+    while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) {
-         switch(c) {
-         case 'l':
-             lower = atoi(optarg);
-             if( lower <= 0 ) {
--                fprintf(stderr, "The number of elements must be positive\n");
-+                fprintf(stderr, "The lower number of elements must be positive\n");
-                 exit(-1);
-             }
-             break;
-         case 'u':
-             upper = atoi(optarg);
-+            if( lower <= 0 ) {
-+                fprintf(stderr, "The upper number of elements must be positive\n");
-+                exit(-1);
-+            }
-+            break;
-+        case 'i':
-+            max_shift = atoi(optarg);
-+            if( max_shift <= 0 ) {
-+                fprintf(stderr, "The max shift must be positive\n");
-+                exit(-1);
-+            }
-             break;
-         case 'f':
-             check = false;
-@@ -216,14 +244,32 @@ int main(int argc, char **argv)
-                 exit(-1);
-             }
-             break;
-+        case '1':
-+            op1_alignment = atoi(optarg);
-+            if( op1_alignment < 0 ) {
-+                fprintf(stderr, "alignment for the first operand must be positive\n");
-+                exit(-1);
-+            }
-+            break;
-+        case '2':
-+            res_alignment = atoi(optarg);
-+            if( res_alignment < 0 ) {
-+                fprintf(stderr, "alignment for the result must be positive\n");
-+                exit(-1);
-+            }
-+            break;
-         case 'h':
-             fprintf(stdout, "%s options are:\n"
-                     " -l <number> : lower number of elements\n"
-                     " -u <number> : upper number of elements\n"
-                     " -s <type_size> : 8, 16, 32 or 64 bits elements\n"
-                     " -t [i,u,f,d] : type of the elements to apply the operations on\n"
-+                    " -r <number> : number of repetitions for each test\n"
-                     " -o <op> : comma separated list of operations to execute among\n"
-                     "           sum, min, max, prod, bor, bxor, band\n"
-+                    " -i <number> : shift on all buffers to check alignment\n"
-+                    " -1 <number> : (mis)alignment in elements for the first op\n"
-+                    " -2 <number> : (mis)alignment in elements for the result\n"
-                     " -v: increase the verbosity level\n"
-                     " -h: this help message\n", argv[0]);
-             exit(0);
-@@ -233,9 +279,10 @@ int main(int argc, char **argv)
-     if( !do_ops_built ) {  /* not yet done, take the default */
-             build_do_ops( "all", do_ops);
-     }
--    in_buf          = malloc(upper * sizeof(double));
--    inout_buf       = malloc(upper * sizeof(double));
--    inout_check_buf = malloc(upper * sizeof(double));
-+    posix_memalign( &in_buf,          64, (upper + op1_alignment) * sizeof(double));
-+    posix_memalign( &inout_buf,       64, (upper + res_alignment) * sizeof(double));
-+    posix_memalign( &inout_check_buf, 64, upper * sizeof(double));
-+    duration = (double*)malloc(max_shift * sizeof(double));
- 
-     ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
- 
-@@ -253,8 +300,8 @@ int main(int argc, char **argv)
-                 correctness = 1;
-                 if('i' == type[type_idx]) {
-                     if( 8 == type_size ) {
--                        int8_t *in_int8 = (int8_t*)in_buf,
--                            *inout_int8 = (int8_t*)inout_buf,
-+                        int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)),
-+                            *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)),
-                             *inout_int8_for_check = (int8_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_int8[i] = 5;
-@@ -299,8 +346,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 16 == type_size ) {
--                        int16_t *in_int16 = (int16_t*)in_buf,
--                            *inout_int16 = (int16_t*)inout_buf,
-+                        int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)),
-+                            *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)),
-                             *inout_int16_for_check = (int16_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_int16[i] = 5;
-@@ -345,8 +392,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 32 == type_size ) {
--                        int32_t *in_int32 = (int32_t*)in_buf,
--                            *inout_int32 = (int32_t*)inout_buf,
-+                        int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)),
-+                            *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)),
-                             *inout_int32_for_check = (int32_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_int32[i] = 5;
-@@ -391,8 +438,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 64 == type_size ) {
--                        int64_t *in_int64 = (int64_t*)in_buf,
--                            *inout_int64 = (int64_t*)inout_buf,
-+                        int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)),
-+                            *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)),
-                             *inout_int64_for_check = (int64_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_int64[i] = 5;
-@@ -440,8 +487,8 @@ int main(int argc, char **argv)
- 
-                 if( 'u' == type[type_idx] ) {
-                     if( 8 == type_size ) {
--                        uint8_t *in_uint8 = (uint8_t*)in_buf,
--                            *inout_uint8 = (uint8_t*)inout_buf,
-+                        uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)),
-+                            *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)),
-                             *inout_uint8_for_check = (uint8_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_uint8[i] = 5;
-@@ -486,8 +533,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 16 == type_size ) {
--                        uint16_t *in_uint16 = (uint16_t*)in_buf,
--                            *inout_uint16 = (uint16_t*)inout_buf,
-+                        uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)),
-+                            *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)),
-                             *inout_uint16_for_check = (uint16_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_uint16[i] = 5;
-@@ -532,8 +579,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 32 == type_size ) {
--                        uint32_t *in_uint32 = (uint32_t*)in_buf,
--                            *inout_uint32 = (uint32_t*)inout_buf,
-+                        uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)),
-+                            *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)),
-                             *inout_uint32_for_check = (uint32_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_uint32[i] = 5;
-@@ -578,8 +625,8 @@ int main(int argc, char **argv)
-                         }
-                     }
-                     if( 64 == type_size ) {
--                        uint64_t *in_uint64 = (uint64_t*)in_buf,
--                              *inout_uint64 = (uint64_t*)inout_buf,
-+                        uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)),
-+                              *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)),
-                             *inout_uint64_for_check = (uint64_t*)inout_check_buf;
-                         for( i = 0; i < count; i++ ) {
-                             in_uint64[i] = 5;
-@@ -626,8 +673,8 @@ int main(int argc, char **argv)
-                 }
- 
-                 if( 'f' == type[type_idx] ) {
--                    float *in_float = (float*)in_buf,
--                        *inout_float = (float*)inout_buf,
-+                    float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)),
-+                        *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)),
-                         *inout_float_for_check = (float*)inout_check_buf;
-                     for( i = 0; i < count; i++ ) {
-                         in_float[i] = 1000.0+1;
-@@ -658,8 +705,8 @@ int main(int argc, char **argv)
-                 }
- 
-                 if( 'd' == type[type_idx] ) {
--                    double *in_double = (double*)in_buf,
--                        *inout_double = (double*)inout_buf,
-+                    double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)),
-+                        *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)),
-                         *inout_double_for_check = (double*)inout_check_buf;
-                     for( i = 0; i < count; i++ ) {
-                         in_double[i] = 10.0+1;
-@@ -691,7 +738,7 @@ int main(int argc, char **argv)
-         check_and_continue:
-                 if( !skip_op_type )
-                     print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
--                                 mpi_type, type_size, count, tend-tstart, correctness);
-+                                 mpi_type, type_size, count, max_shift, duration, repeats, correctness);
-             }
-             if( !skip_op_type )
-                 printf("\n");
diff --git a/8348.patch b/8348.patch
deleted file mode 100644
index 89d5fd7..0000000
--- a/8348.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 838568da9fce85b4555b0e0cbd899c8e8ef75696 Mon Sep 17 00:00:00 2001
-From: George Bosilca <bosilca@icl.utk.edu>
-Date: Wed, 6 Jan 2021 13:30:40 -0500
-Subject: [PATCH] A started generalized request should be marked as pending.
-
-Fixes #8340
-
-Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
-(cherry picked from commit 434a2515f8aab11f505b2fca0b3d8cc41e24cef2)
----
- ompi/request/grequest.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/ompi/request/grequest.c b/ompi/request/grequest.c
-index c895b4232b6..02affd642aa 100644
---- a/ompi/request/grequest.c
-+++ b/ompi/request/grequest.c
-@@ -163,6 +163,7 @@ int ompi_grequest_start(
-     greq->greq_free.c_free = gfree_fn;
-     greq->greq_cancel.c_cancel = gcancel_fn;
-     greq->greq_base.req_status = ompi_status_empty;
-+    greq->greq_base.req_complete = REQUEST_PENDING;
- 
-     *request = &greq->greq_base;
-     return OMPI_SUCCESS;
diff --git a/openmpi.spec b/openmpi.spec
index c3483ad..a53bbc5 100644
--- a/openmpi.spec
+++ b/openmpi.spec
@@ -26,12 +26,11 @@
 %endif
 
 # Run autogen - needed for some patches
-# For Patch0
 %bcond_without autogen
 
 Name:           openmpi%{?_cc_name_suffix}
-Version:        4.1.0
-Release:        7%{?dist}
+Version:        4.1.1
+Release:        2%{?dist}
 Summary:        Open Message Passing Interface
 License:        BSD and MIT and Romio
 URL:            http://www.open-mpi.org/
@@ -42,11 +41,8 @@ Source1:        openmpi.module.in
 Source2:        openmpi.pth.py2
 Source3:        openmpi.pth.py3
 Source4:        macros.openmpi
-
-# Fix AVX library linkage
-Patch0:         https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8322.patch
-# Fix generalized requests (mpi4py test failure)
-Patch1:         https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8348.patch
+Patch1:         266189935aef4fce825d0db831b4b53accc62c32.patch
+Patch2:         0001-Revert-ucx-check-supported-transports-and-devices-fo.patch
 
 BuildRequires:  gcc-c++
 BuildRequires:  gcc-gfortran
@@ -82,6 +78,9 @@ BuildRequires:  perl-interpreter
 BuildRequires:  perl(Getopt::Long)
 BuildRequires:  pmix-devel
 BuildRequires:  python%{python3_pkgversion}-devel
+%ifarch x86_64
+BuildRequires:  libpsm2-devel
+%endif
 %if %{with ucx}
 BuildRequires:  ucx-devel
 %endif
@@ -91,7 +90,7 @@ BuildRequires:  rpm-mpi-hooks
 %endif
 
 Provides:       mpi
-%if 0%{?rhel}
+%if 0%{?rhel} == 7
 # Need this for /etc/profile.d/modules.sh
 Requires:       environment-modules
 %endif
@@ -169,7 +168,7 @@ OpenMPI support for Python 3.
 
 
 %prep
-%autosetup -p1
+%autosetup -p1 -n %{name}-%{version}
 %if %{with autogen}
 ./autogen.pl --force
 %endif
@@ -358,6 +357,13 @@ make check
 
 
 %changelog
+* Thu Jul 15 2021 Honggang Li <honli@redhat.com> - 4.1.1-2
+- Update to 4.1.1
+- Enable psm2 support
+- fbtl-posix: link to common_ompio
+- Revert upstream commit c36d7459b6331c4d ("ucx: check supported transports and devices for setting priority")
+- Resolves: rhbz#1869443
+
 * Fri Apr 16 2021 Mohan Boddu <mboddu@redhat.com> - 4.1.0-7
 - Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937
 
diff --git a/sources b/sources
index ebb82c2..cc0a735 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (openmpi-4.1.0.tar.bz2) = eaf086ab4929ce5a9a3e867c8315bf802ff4dc75d3f05d740e22dfd97803a4559212dacbe06920d42ac6644f46057eb6980cccf5a8b0a7df9c5bdf5bffc0b3a6
+SHA512 (openmpi-4.1.1.tar.bz2) = 0d85ba45a40c0879f266e5286615e2cf94eb3570f0a705194525821d5c85d460cefc3a2da8207e6e84c479d3d0da656e2342cc2d6f88c4b4577ca22bbeacc89d