diff --git a/.gitignore b/.gitignore
index ca05fd6..104cfac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,4 @@ openmpi-1.4.1-RH.tar.bz2
 /openmpi-4.0.4rc1.tar.bz2
 /openmpi-4.0.4.tar.bz2
 /openmpi-4.0.5.tar.bz2
+/openmpi-4.1.0.tar.bz2
diff --git a/8322.patch b/8322.patch
new file mode 100644
index 0000000..8c04e5d
--- /dev/null
+++ b/8322.patch
@@ -0,0 +1,1682 @@
+From 31068e063b8795ae11f3a59d4080db1fe111cfaf Mon Sep 17 00:00:00 2001
+From: George Bosilca <bosilca@icl.utk.edu>
+Date: Mon, 28 Dec 2020 15:36:05 -0500
+Subject: [PATCH 1/3] Major update to the AVX* detection and support
+
+1. Use a consistent -march flag order between configure and make.
+
+2. op/avx: give the option to skip some tests
+
+It is possible to skip some of the intrinsics tests by setting the following environment variables to "no" before invoking configure (see the example after this list):
+ - ompi_cv_op_avx_check_avx512
+ - ompi_cv_op_avx_check_avx2
+ - ompi_cv_op_avx_check_avx
+ - ompi_cv_op_avx_check_sse41
+ - ompi_cv_op_avx_check_sse3
+
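+For example (an illustrative invocation; adjust the variables to the tests
+you want to skip):
+
+    ompi_cv_op_avx_check_avx512=no ompi_cv_op_avx_check_avx2=no ./configure
+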
+3. op/avx: update AVX512 flags
+
+Try
+-mavx512f -mavx512bw -mavx512vl -mavx512dq
+instead of
+-march=skylake-avx512
+
+since the former is less likely to conflict with user-provided CFLAGS
+(e.g. -march=...), as illustrated below.
+
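+For example (illustrative), if a user configures with
+
+    ./configure CFLAGS="-O2 -march=haswell"
+
+appending -march=skylake-avx512 would silently override the user's -march
+choice, while the individual -mavx512* flags only enable the specific ISA
+extensions the component needs.
+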
+Thanks to Bart Oldeman for pointing this out.
+
+4. op/avx: have the op/avx library depend on libmpi.so
+
+Refs. open-mpi/ompi#8323
+
+Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
+Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
+---
+ ompi/mca/op/avx/Makefile.am  |   4 +-
+ ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++-----------------
+ 2 files changed, 174 insertions(+), 155 deletions(-)
+
+diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
+index 41dcf2e1834..b1d84d90b33 100644
+--- a/ompi/mca/op/avx/Makefile.am
++++ b/ompi/mca/op/avx/Makefile.am
+@@ -2,7 +2,7 @@
+ # Copyright (c) 2019-2020 The University of Tennessee and The University
+ #                         of Tennessee Research Foundation.  All rights
+ #                         reserved.
+-# Copyright (c) 2020      Research Organization for Information Science
++# Copyright (c) 2020-2021 Research Organization for Information Science
+ #                         and Technology (RIST).  All rights reserved.
+ # $COPYRIGHT$
+ #
+@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+ mca_op_avx_la_SOURCES = $(sources)
+ mca_op_avx_la_LIBADD = $(specialized_op_libs)
+-mca_op_avx_la_LDFLAGS = -module -avoid-version
++mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
+ 
+ 
+ # Specific information for static builds.
+diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
+index 09d8b374c8e..f61b7100ef4 100644
+--- a/ompi/mca/op/avx/configure.m4
++++ b/ompi/mca/op/avx/configure.m4
+@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+     op_avx_support=0
+     op_avx2_support=0
+     op_avx512_support=0
++
++    AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3])
++    AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41])
++    AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx])
++    AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2])
++    AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512])
++
+     OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
+ 
+     AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
+@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            #
+            # Check for AVX512 support
+            #
+-           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
+-           AC_LINK_IFELSE(
+-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                                [[
+-    __m512 vA, vB;
+-    _mm512_add_ps(vA, vB)
+-                                ]])],
+-               [op_avx512_support=1
+-                AC_MSG_RESULT([yes])],
+-               [AC_MSG_RESULT([no])])
+-
+-           AS_IF([test $op_avx512_support -eq 0],
+-                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
+-                  op_avx_cflags_save="$CFLAGS"
+-                  CFLAGS="$CFLAGS -march=skylake-avx512"
++           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
++           AS_IF([test "$op_avx_check_avx512" = "yes"],
++                 [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
+                   AC_LINK_IFELSE(
+                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                        [[
+@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+     _mm512_add_ps(vA, vB)
+                                        ]])],
+                       [op_avx512_support=1
+-                       MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
+                        AC_MSG_RESULT([yes])],
+                       [AC_MSG_RESULT([no])])
+-                  CFLAGS="$op_avx_cflags_save"
+-                 ])
+-           #
+-           # Some combination of gcc and older as would not correctly build the code generated by
+-           # _mm256_loadu_si256. Screen them out.
+-           #
+-           AS_IF([test $op_avx512_support -eq 1],
+-                 [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
+-                  op_avx_cflags_save="$CFLAGS"
+-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
+-                  AC_LINK_IFELSE(
+-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                               [[
++
++                  AS_IF([test $op_avx512_support -eq 0],
++                        [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)])
++                         op_avx_cflags_save="$CFLAGS"
++                         CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS"
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                              [[
++    __m512 vA, vB;
++    _mm512_add_ps(vA, vB)
++                                       ]])],
++                             [op_avx512_support=1
++                              MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq"
++                              AC_MSG_RESULT([yes])],
++                             [AC_MSG_RESULT([no])])
++                         CFLAGS="$op_avx_cflags_save"
++                        ])
++                  #
++                  # Some combination of gcc and older as would not correctly build the code generated by
++                  # _mm256_loadu_si256. Screen them out.
++                  #
++                  AS_IF([test $op_avx512_support -eq 1],
++                        [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
++                         op_avx_cflags_save="$CFLAGS"
++                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                      [[
+     int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+     __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
+-                               ]])],
+-                      [AC_MSG_RESULT([yes])],
+-                      [op_avx512_support=0
+-                       MCA_BUILD_OP_AVX512_FLAGS=""
+-                       AC_MSG_RESULT([no])])
+-                  CFLAGS="$op_avx_cflags_save"
+-                 ])
+-           #
+-           # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
+-           #
+-           AS_IF([test $op_avx512_support -eq 1],
+-                 [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
+-                  op_avx_cflags_save="$CFLAGS"
+-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
+-                  AC_LINK_IFELSE(
+-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                               [[
++                                      ]])],
++                             [AC_MSG_RESULT([yes])],
++                             [op_avx512_support=0
++                              MCA_BUILD_OP_AVX512_FLAGS=""
++                              AC_MSG_RESULT([no])])
++                         CFLAGS="$op_avx_cflags_save"
++                        ])
++                  #
++                  # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
++                  #
++                  AS_IF([test $op_avx512_support -eq 1],
++                        [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
++                         op_avx_cflags_save="$CFLAGS"
++                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                      [[
+     __m512i vA, vB;
+     _mm512_mullo_epi64(vA, vB)
+-                               ]])],
+-                      [AC_MSG_RESULT([yes])],
+-                      [op_avx512_support=0
+-                       MCA_BUILD_OP_AVX512_FLAGS=""
+-                       AC_MSG_RESULT([no])])
+-                  CFLAGS="$op_avx_cflags_save"
+-                 ])
++                                      ]])],
++                             [AC_MSG_RESULT([yes])],
++                             [op_avx512_support=0
++                              MCA_BUILD_OP_AVX512_FLAGS=""
++                              AC_MSG_RESULT([no])])
++                         CFLAGS="$op_avx_cflags_save"
++                        ])])
+            #
+            # Check support for AVX2
+            #
+-           AC_MSG_CHECKING([for AVX2 support (no additional flags)])
+-           AC_LINK_IFELSE(
+-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                       [[
++           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
++           AS_IF([test "$op_avx_check_avx2" = "yes"],
++                 [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
++                  AC_LINK_IFELSE(
++                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                              [[
+     __m256 vA, vB;
+     _mm256_add_ps(vA, vB)
+-                       ]])],
+-               [op_avx2_support=1
+-                AC_MSG_RESULT([yes])],
+-               [AC_MSG_RESULT([no])])
+-           AS_IF([test $op_avx2_support -eq 0],
+-               [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
+-                op_avx_cflags_save="$CFLAGS"
+-                CFLAGS="$CFLAGS -mavx2"
+-                AC_LINK_IFELSE(
+-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                            [[
++                              ]])],
++                      [op_avx2_support=1
++                       AC_MSG_RESULT([yes])],
++                      [AC_MSG_RESULT([no])])
++                  AS_IF([test $op_avx2_support -eq 0],
++                      [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
++                       op_avx_cflags_save="$CFLAGS"
++                       CFLAGS="-mavx2 $CFLAGS"
++                       AC_LINK_IFELSE(
++                           [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                   [[
+     __m256 vA, vB;
+     _mm256_add_ps(vA, vB)
+-                            ]])],
+-                    [op_avx2_support=1
+-                     MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
+-                     AC_MSG_RESULT([yes])],
+-                    [AC_MSG_RESULT([no])])
+-                CFLAGS="$op_avx_cflags_save"
+-                ])
+-           #
+-           # Some combination of gcc and older as would not correctly build the code generated by
+-           # _mm256_loadu_si256. Screen them out.
+-           #
+-           AS_IF([test $op_avx2_support -eq 1],
+-                 [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
+-                  op_avx_cflags_save="$CFLAGS"
+-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
+-                  AC_LINK_IFELSE(
+-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                               [[
++                                   ]])],
++                           [op_avx2_support=1
++                            MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
++                            AC_MSG_RESULT([yes])],
++                           [AC_MSG_RESULT([no])])
++                       CFLAGS="$op_avx_cflags_save"
++                       ])
++                  #
++                  # Some combination of gcc and older as would not correctly build the code generated by
++                  # _mm256_loadu_si256. Screen them out.
++                  #
++                  AS_IF([test $op_avx2_support -eq 1],
++                        [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
++                         op_avx_cflags_save="$CFLAGS"
++                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                      [[
+     int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+     __m256i vA = _mm256_loadu_si256((__m256i*)&A)
+-                               ]])],
+-                      [AC_MSG_RESULT([yes])],
+-                      [op_avx2_support=0
+-                       MCA_BUILD_OP_AVX2_FLAGS=""
+-                       AC_MSG_RESULT([no])])
+-                  CFLAGS="$op_avx_cflags_save"
+-                 ])
++                                      ]])],
++                             [AC_MSG_RESULT([yes])],
++                             [op_avx2_support=0
++                              MCA_BUILD_OP_AVX2_FLAGS=""
++                              AC_MSG_RESULT([no])])
++                         CFLAGS="$op_avx_cflags_save"
++                        ])])
+            #
+            # What about early AVX support. The rest of the logic is slightly different as
+            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
+@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
+            # instructions.
+            #
+-           AC_MSG_CHECKING([for AVX support (no additional flags)])
+-           AC_LINK_IFELSE(
+-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                       [[
++           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
++           AS_IF([test "$op_avx_check_avx" = "yes"],
++                 [AC_MSG_CHECKING([for AVX support (no additional flags)])
++                  AC_LINK_IFELSE(
++                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                              [[
+     __m128 vA, vB;
+     _mm_add_ps(vA, vB)
+-                       ]])],
+-               [op_avx_support=1
+-                AC_MSG_RESULT([yes])],
+-               [AC_MSG_RESULT([no])])
++                              ]])],
++                      [op_avx_support=1
++                       AC_MSG_RESULT([yes])],
++                      [AC_MSG_RESULT([no])])])
+            #
+            # Check for SSE4.1 support
+            #
+-           AS_IF([test $op_avx_support -eq 1],
+-               [AC_MSG_CHECKING([for SSE4.1 support])
+-                AC_LINK_IFELSE(
+-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                            [[
++           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
++           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
++                 [AC_MSG_CHECKING([for SSE4.1 support])
++                  AC_LINK_IFELSE(
++                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                              [[
+     __m128i vA, vB;
+     (void)_mm_max_epi8(vA, vB)
+-                            ]])],
+-                    [op_sse41_support=1
+-                     AC_MSG_RESULT([yes])],
+-                    [AC_MSG_RESULT([no])])
+-                ])
++                              ]])],
++                      [op_sse41_support=1
++                       AC_MSG_RESULT([yes])],
++                      [AC_MSG_RESULT([no])])
++                  ])
+            #
+            # Check for SSE3 support
+            #
+-           AS_IF([test $op_avx_support -eq 1],
+-               [AC_MSG_CHECKING([for SSE3 support])
+-                AC_LINK_IFELSE(
+-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                            [[
++           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
++           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
++                 [AC_MSG_CHECKING([for SSE3 support])
++                  AC_LINK_IFELSE(
++                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                              [[
+     int A[4] = {0, 1, 2, 3};
+     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
+-                            ]])],
+-                    [op_sse3_support=1
+-                     AC_MSG_RESULT([yes])],
+-                    [AC_MSG_RESULT([no])])
+-                ])
++                              ]])],
++                      [op_sse3_support=1
++                       AC_MSG_RESULT([yes])],
++                      [AC_MSG_RESULT([no])])
++                  ])
+            # Second pass, do we need to add the AVX flag ?
+            AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
+-               [AC_MSG_CHECKING([for AVX support (with -mavx)])
+-                op_avx_cflags_save="$CFLAGS"
+-                CFLAGS="$CFLAGS -mavx"
+-                AC_LINK_IFELSE(
+-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                            [[
++                 [AS_IF([test "$op_avx_check_avx" = "yes"],
++                        [AC_MSG_CHECKING([for AVX support (with -mavx)])
++                         op_avx_cflags_save="$CFLAGS"
++                         CFLAGS="-mavx $CFLAGS"
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                   [[
+     __m128 vA, vB;
+     _mm_add_ps(vA, vB)
+                             ]])],
+-                    [op_avx_support=1
+-                     MCA_BUILD_OP_AVX_FLAGS="-mavx"
+-                     op_sse41_support=0
+-                     op_sse3_support=0
+-                     AC_MSG_RESULT([yes])],
+-                    [AC_MSG_RESULT([no])])
++                             [op_avx_support=1
++                              MCA_BUILD_OP_AVX_FLAGS="-mavx"
++                              op_sse41_support=0
++                              op_sse3_support=0
++                              AC_MSG_RESULT([yes])],
++                             [AC_MSG_RESULT([no])])])
+ 
+-                AS_IF([test $op_sse41_support -eq 0],
+-                    [AC_MSG_CHECKING([for SSE4.1 support])
+-                     AC_LINK_IFELSE(
+-                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+-                                 [[
++                  AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0],
++                        [AC_MSG_CHECKING([for SSE4.1 support])
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                     [[
+     __m128i vA, vB;
+     (void)_mm_max_epi8(vA, vB)
+-                                 ]])],
+-                         [op_sse41_support=1
+-                          AC_MSG_RESULT([yes])],
+-                         [AC_MSG_RESULT([no])])
+-                     ])
+-                AS_IF([test $op_sse3_support -eq 0],
+-                    [AC_MSG_CHECKING([for SSE3 support])
+-                     AC_LINK_IFELSE(
+-                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
++                                     ]])],
++                             [op_sse41_support=1
++                              AC_MSG_RESULT([yes])],
++                             [AC_MSG_RESULT([no])])])
++                  AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0],
++                        [AC_MSG_CHECKING([for SSE3 support])
++                         AC_LINK_IFELSE(
++                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                  [[
+     int A[4] = {0, 1, 2, 3};
+     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
+                                  ]])],
+-                         [op_sse3_support=1
+-                          AC_MSG_RESULT([yes])],
+-                         [AC_MSG_RESULT([no])])
+-                     ])
+-                CFLAGS="$op_avx_cflags_save"
+-               ])
++                             [op_sse3_support=1
++                              AC_MSG_RESULT([yes])],
++                             [AC_MSG_RESULT([no])])])
++                  CFLAGS="$op_avx_cflags_save"])
+ 
+            AC_LANG_POP([C])
+           ])
+@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+     AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
+     AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
+ 
++    AS_VAR_POPDEF([op_avx_check_avx512])
++    AS_VAR_POPDEF([op_avx_check_avx2])
++    AS_VAR_POPDEF([op_avx_check_avx])
++    AS_VAR_POPDEF([op_avx_check_sse41])
++    AS_VAR_POPDEF([op_avx_check_sse3])
++
+     OPAL_VAR_SCOPE_POP
+     # Enable this component iff we have at least the most basic form of support
+     # for vectorial ISA
+
+From fcf2766a03e3c2a1001679013878209bcddd50ae Mon Sep 17 00:00:00 2001
+From: George Bosilca <bosilca@icl.utk.edu>
+Date: Mon, 28 Dec 2020 12:18:07 -0500
+Subject: [PATCH 2/3] AVX code generation improvements
+
+1. Allow falling back to lesser AVX support during make
+
+Because some distros restrict the compile architecture during make (while
+not setting any restrictions during configure), we need to detect the
+target architecture during make as well, in order to restrict the code we
+generate, as in the example below.
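+
+For example (illustrative), a distro package build might run:
+
+    ./configure CFLAGS=-O2             # AVX512 support detected here
+    make CFLAGS="-O2 -march=haswell"   # __AVX512F__ is no longer defined,
+                                       # so the code falls back to AVX2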
+
+2. Add comments and better protect the arch-specific code.
+
+Identify all the vector functions used and classify them according to the
+necessary hardware capabilities.
+Use these requirements to protect the load and store code (the rest of the
+code is automatically generated, which makes it more difficult to
+protect).
+
+3. Correctly check for AVX* support.
+
+Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
+---
+ ompi/mca/op/avx/configure.m4       |  28 +--
+ ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++-----
+ 2 files changed, 288 insertions(+), 62 deletions(-)
+
+diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
+index f61b7100ef4..f3651f09d43 100644
+--- a/ompi/mca/op/avx/configure.m4
++++ b/ompi/mca/op/avx/configure.m4
+@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            #
+            # Check for AVX512 support
+            #
+-           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
++           AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
+            AS_IF([test "$op_avx_check_avx512" = "yes"],
+                  [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
+                   AC_LINK_IFELSE(
+@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            #
+            # Check support for AVX2
+            #
+-           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
++           AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
+            AS_IF([test "$op_avx_check_avx2" = "yes"],
+                  [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
+                   AC_LINK_IFELSE(
+                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                               [[
+-    __m256 vA, vB;
+-    _mm256_add_ps(vA, vB)
++    __m256i vA, vB, vC;
++    vC = _mm256_and_si256(vA, vB)
+                               ]])],
+                       [op_avx2_support=1
+                        AC_MSG_RESULT([yes])],
+@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+                        AC_LINK_IFELSE(
+                            [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                    [[
+-    __m256 vA, vB;
+-    _mm256_add_ps(vA, vB)
++    __m256i vA, vB, vC;
++    vC = _mm256_and_si256(vA, vB)
+                                    ]])],
+                            [op_avx2_support=1
+                             MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
+@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+                          CFLAGS="$op_avx_cflags_save"
+                         ])])
+            #
+-           # What about early AVX support. The rest of the logic is slightly different as
++           # What about early AVX support? The rest of the logic is slightly different as
+            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
+            # if we can compile AVX code without a flag, then we validate that we have support
+            # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
+            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
+            # instructions.
+            #
+-           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
++           AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
+            AS_IF([test "$op_avx_check_avx" = "yes"],
+                  [AC_MSG_CHECKING([for AVX support (no additional flags)])
+                   AC_LINK_IFELSE(
+                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                               [[
+-    __m128 vA, vB;
+-    _mm_add_ps(vA, vB)
++    __m256 vA, vB, vC;
++    vC = _mm256_add_ps(vA, vB)
+                               ]])],
+                       [op_avx_support=1
+                        AC_MSG_RESULT([yes])],
+@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            #
+            # Check for SSE4.1 support
+            #
+-           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
++           AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
+            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
+                  [AC_MSG_CHECKING([for SSE4.1 support])
+                   AC_LINK_IFELSE(
+@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+            #
+            # Check for SSE3 support
+            #
+-           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
++           AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
+            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
+                  [AC_MSG_CHECKING([for SSE3 support])
+                   AC_LINK_IFELSE(
+@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+                          AC_LINK_IFELSE(
+                              [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                    [[
+-    __m128 vA, vB;
+-    _mm_add_ps(vA, vB)
++    __m256 vA, vB, vC;
++    vC = _mm256_add_ps(vA, vB)
+                             ]])],
+                              [op_avx_support=1
+                               MCA_BUILD_OP_AVX_FLAGS="-mavx"
+diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
+index 95a9c9ab84e..ef3f0932906 100644
+--- a/ompi/mca/op/avx/op_avx_functions.c
++++ b/ompi/mca/op/avx/op_avx_functions.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright (c) 2019-2020 The University of Tennessee and The University
++ * Copyright (c) 2019-2021 The University of Tennessee and The University
+  *                         of Tennessee Research Foundation.  All rights
+  *                         reserved.
+  * Copyright (c) 2020      Research Organization for Information Science
+@@ -24,16 +24,42 @@
+ #include "ompi/mca/op/avx/op_avx.h"
+ 
+ #include <immintrin.h>
+-
++/**
++ * The following logic is necessary to cope with distro maintainers' desire to change the compilation
++ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what
++ * code can be generated during make. If we detect that the current code-generation architecture has
++ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2), we fall back
++ * to lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out).
++ */
+ #if defined(GENERATE_AVX512_CODE)
+-#define PREPEND _avx512
+-#elif defined(GENERATE_AVX2_CODE)
+-#define PREPEND _avx2
+-#elif defined(GENERATE_AVX_CODE)
+-#define PREPEND _avx
+-#else
+-#error This file should not be compiled in this conditions
+-#endif
++#  if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__)
++#    define PREPEND _avx512
++#  else
++#    undef GENERATE_AVX512_CODE
++#  endif  /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */
++#endif  /* defined(GENERATE_AVX512_CODE) */
++
++#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE)
++#  if defined(__AVX2__)
++#    define PREPEND _avx2
++#  else
++#    undef GENERATE_AVX2_CODE
++#  endif  /* defined(__AVX2__) */
++#endif  /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */
++
++#if !defined(PREPEND) && defined(GENERATE_AVX_CODE)
++#  if defined(__AVX__)
++#    define PREPEND _avx
++#  endif
++#endif  /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */
++
++#if !defined(PREPEND)
++#  if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2
++#    error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step.
++#  else
++#    error This file should not be compiled under these conditions. Please provide the config.log file to the OMPI developers.
++#  endif  /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */
++#endif  /* !defined(PREPEND) */
+ 
+ /*
+  * Concatenate preprocessor tokens A and B without expanding macro definitions
+@@ -46,6 +72,102 @@
+  */
+ #define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
+ 
++/*
++ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq
++ *
++ * https://software.intel.com/sites/landingpage/IntrinsicsGuide
++ *
++ * _mm_add_epi[8,16,32,64]         SSE2
++ * _mm_add_pd                      SSE2
++ * _mm_add_ps                      SSE
++ * _mm_adds_epi[8,16]              SSE2
++ * _mm_adds_epu[8,16]              SSE2
++ * _mm_and_si128                   SSE2
++ * _mm_lddqu_si128                 SSE3
++ * _mm_loadu_pd                    SSE2
++ * _mm_loadu_ps                    SSE
++ * _mm_max_epi8                    SSE4.1
++ * _mm_max_epi16                   SSE2
++ * _mm_max_epi32                   SSE4.1
++ * _mm_max_epi64                   AVX512VL + AVX512F
++ * _mm_max_epu8                    SSE2
++ * _mm_max_epu[16,32]              SSE4.1
++ * _mm_max_epu64                   AVX512VL + AVX512F
++ * _mm_max_pd                      SSE2
++ * _mm_max_ps                      SSE
++ * _mm_min_epi8                    SSE4.1
++ * _mm_min_epi16                   SSE2
++ * _mm_min_epi32                   SSE4.1
++ * _mm_min_epi64                   AVX512VL + AVX512F
++ * _mm_min_epu8                    SSE2
++ * _mm_min_epu[16,32]              SSE4.1
++ * _mm_min_epu64                   AVX512VL + AVX512F
++ * _mm_min_pd                      SSE2
++ * _mm_min_ps                      SSE
++ * _mm_mul_pd                      SSE2
++ * _mm_mul_ps                      SSE
++ * _mm_mullo_epi16                 SSE2
++ * _mm_mullo_epi32                 SSE4.1
++ * _mm_mullo_epi64                 AVX512VL + AVX512DQ
++ * _mm_or_si128                    SSE2
++ * _mm_storeu_pd                   SSE2
++ * _mm_storeu_ps                   SSE
++ * _mm_storeu_si128                SSE2
++ * _mm_xor_si128                   SSE2
++ * _mm256_add_epi[8,16,32,64]      AVX2
++ * _mm256_add_p[s,d]               AVX
++ * _mm256_adds_epi[8,16]           AVX2
++ * _mm256_adds_epu[8,16]           AVX2
++ * _mm256_and_si256                AVX2
++ * _mm256_loadu_p[s,d]             AVX
++ * _mm256_loadu_si256              AVX
++ * _mm256_max_epi[8,16,32]         AVX2
++ * _mm256_max_epi64                AVX512VL + AVX512F
++ * _mm256_max_epu[8,16,32]         AVX2
++ * _mm256_max_epu64                AVX512VL + AVX512F
++ * _mm256_max_p[s,d]               AVX
++ * _mm256_min_epi[8,16,32]         AVX2
++ * _mm256_min_epi64                AVX512VL + AVX512F
++ * _mm256_min_epu[8,16,32]         AVX2
++ * _mm256_min_epu64                AVX512VL + AVX512F
++ * _mm256_min_p[s,d]               AVX
++ * _mm256_mul_p[s,d]               AVX
++ * _mm256_mullo_epi[16,32]         AVX2
++ * _mm256_mullo_epi64              AVX512VL + AVX512DQ
++ * _mm256_or_si256                 AVX2
++ * _mm256_storeu_p[s,d]            AVX
++ * _mm256_storeu_si256             AVX
++ * _mm256_xor_si256                AVX2
++ * _mm512_add_epi[8,16]            AVX512BW
++ * _mm512_add_epi[32,64]           AVX512F
++ * _mm512_add_p[s,d]               AVX512F
++ * _mm512_adds_epi[8,16]           AVX512BW
++ * _mm512_adds_epu[8,16]           AVX512BW
++ * _mm512_and_si512                AVX512F
++ * _mm512_cvtepi16_epi8            AVX512BW
++ * _mm512_cvtepi8_epi16            AVX512BW
++ * _mm512_loadu_p[s,d]             AVX512F
++ * _mm512_loadu_si512              AVX512F
++ * _mm512_max_epi[8,16]            AVX512BW
++ * _mm512_max_epi[32,64]           AVX512F
++ * _mm512_max_epu[8,16]            AVX512BW
++ * _mm512_max_epu[32,64]           AVX512F
++ * _mm512_max_p[s,d]               AVX512F
++ * _mm512_min_epi[8,16]            AVX512BW
++ * _mm512_min_epi[32,64]           AVX512F
++ * _mm512_min_epu[8,16]            AVX512BW
++ * _mm512_min_epu[32,64]           AVX512F
++ * _mm512_min_p[s,d]               AVX512F
++ * _mm512_mul_p[s,d]               AVX512F
++ * _mm512_mullo_epi16              AVX512BW
++ * _mm512_mullo_epi32              AVX512F
++ * _mm512_mullo_epi64              AVX512DQ
++ * _mm512_or_si512                 AVX512F
++ * _mm512_storeu_p[s,d]            AVX512F
++ * _mm512_storeu_si512             AVX512F
++ * _mm512_xor_si512                AVX512F
++ */
++
+ /*
+  * Since all the functions in this file are essentially identical, we
+  * use a macro to substitute in names and types.  The core operation
+@@ -62,13 +184,14 @@
+   (((_flag) & mca_op_avx_component.flags) == (_flag))
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op)               \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
+         int types_per_step = (512 / 8) / sizeof(type);                         \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
+-            __m512i vecA =  _mm512_loadu_si512((__m512*)in);                   \
++            __m512i vecA = _mm512_loadu_si512((__m512*)in);                    \
+             in += types_per_step;                                              \
+-            __m512i vecB =  _mm512_loadu_si512((__m512*)out);                  \
++            __m512i vecB = _mm512_loadu_si512((__m512*)out);                   \
+             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB);  \
+             _mm512_storeu_si512((__m512*)out, res);                            \
+             out += types_per_step;                                             \
+@@ -76,10 +199,14 @@
+         if( 0 == left_over ) return;                                           \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op)                 \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) {  \
+         int types_per_step = (256 / 8) / sizeof(type);  /* AVX2 */             \
+@@ -87,30 +214,37 @@
+             __m256i vecA = _mm256_loadu_si256((__m256i*)in);                   \
+             in += types_per_step;                                              \
+             __m256i vecB = _mm256_loadu_si256((__m256i*)out);                  \
+-            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
++            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB);  \
+             _mm256_storeu_si256((__m256i*)out, res);                           \
+             out += types_per_step;                                             \
+         }                                                                      \
+         if( 0 == left_over ) return;                                           \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE3__
+ #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op)               \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
+-        int types_per_step = (128 / 8) / sizeof(type);  /* AVX */              \
++        int types_per_step = (128 / 8) / sizeof(type);                         \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
+             __m128i vecA = _mm_lddqu_si128((__m128i*)in);                      \
+             in += types_per_step;                                              \
+             __m128i vecB = _mm_lddqu_si128((__m128i*)out);                     \
+-            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB);    \
++            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB);     \
+             _mm_storeu_si128((__m128i*)out, res);                              \
+             out += types_per_step;                                             \
+         }                                                                      \
+     }
+ #else
++#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
++#endif  /* __SSE3__ */
++#else
+ #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512BW__ && __AVX__
+ #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op)         \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {  \
+         int types_per_step = (256 / 8) / sizeof(type);                  \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in);       \
+-            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)out);      \
++            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in);        \
++            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out);       \
+             in += types_per_step;                                       \
+             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
+             __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp);              \
+@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
++#endif  /* __AVX512BW__ && __AVX__ */
++#else
+ #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ /**
+@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
+  *
+  */
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op)               \
+     if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {        \
+         types_per_step = (512 / 8) / sizeof(type);                      \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512i vecA =  _mm512_loadu_si512((__m512i*)in);           \
++            __m512i vecA = _mm512_loadu_si512((__m512i*)in);            \
+             in += types_per_step;                                       \
+-            __m512i vecB =  _mm512_loadu_si512((__m512i*)out);          \
++            __m512i vecB = _mm512_loadu_si512((__m512i*)out);           \
+             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
+             _mm512_storeu_si512((__m512i*)out, res);                    \
+             out += types_per_step;                                      \
+@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op)                 \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+         types_per_step = (256 / 8) / sizeof(type);                      \
+@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
+             __m256i vecA = _mm256_loadu_si256((__m256i*)in);            \
+             in += types_per_step;                                       \
+             __m256i vecB = _mm256_loadu_si256((__m256i*)out);           \
+-            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
++            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
+             _mm256_storeu_si256((__m256i*)out, res);                    \
+             out += types_per_step;                                      \
+         }                                                               \
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE3__ && __SSE2__
+ #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op)                 \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
+         types_per_step = (128 / 8) / sizeof(type);                      \
+@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
+             __m128i vecA = _mm_lddqu_si128((__m128i*)in);               \
+             in += types_per_step;                                       \
+             __m128i vecB = _mm_lddqu_si128((__m128i*)out);              \
+-            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
++            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
+             _mm_storeu_si128((__m128i*)out, res);                       \
+             out += types_per_step;                                      \
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
++#endif  /* __SSE3__ && __SSE2__ */
++#else
+ #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_FLOAT_FUNC(op)                                    \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
+         types_per_step = (512 / 8) / sizeof(float);                     \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512 vecA =  _mm512_loadu_ps((__m512*)in);                \
+-            __m512 vecB =  _mm512_loadu_ps((__m512*)out);         \
++            __m512 vecA = _mm512_loadu_ps((__m512*)in);                 \
++            __m512 vecB = _mm512_loadu_ps((__m512*)out);                \
+             in += types_per_step;                                       \
+             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
+             _mm512_storeu_ps((__m512*)out, res);                        \
+@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_FLOAT_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX_FLOAT_FUNC(op)                                       \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
+         types_per_step = (256 / 8) / sizeof(float);                     \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+-            __m256 vecA =  _mm256_loadu_ps(in);                          \
++            __m256 vecA = _mm256_loadu_ps(in);                          \
+             in += types_per_step;                                       \
+-            __m256 vecB =  _mm256_loadu_ps(out);                         \
++            __m256 vecB = _mm256_loadu_ps(out);                         \
+             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
+-            _mm256_storeu_ps(out, res);                                  \
++            _mm256_storeu_ps(out, res);                                 \
+             out += types_per_step;                                      \
+         }                                                               \
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX_FLOAT_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE__
+ #define OP_AVX_SSE_FLOAT_FUNC(op)                                       \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
+         types_per_step = (128 / 8) / sizeof(float);                     \
+@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
++#endif  /* __SSE__ */
++#else
+ #define OP_AVX_SSE_FLOAT_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_DOUBLE_FUNC(op)                                   \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
+         types_per_step = (512 / 8)  / sizeof(double);                   \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512d vecA =  _mm512_loadu_pd(in);                        \
++            __m512d vecA = _mm512_loadu_pd(in);                         \
+             in += types_per_step;                                       \
+-            __m512d vecB =  _mm512_loadu_pd(out);                       \
++            __m512d vecB = _mm512_loadu_pd(out);                        \
+             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
+             _mm512_storeu_pd((out), res);                               \
+             out += types_per_step;                                      \
+@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX_DOUBLE_FUNC(op)                                      \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
+         types_per_step = (256 / 8)  / sizeof(double);                   \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+-            __m256d vecA =  _mm256_loadu_pd(in);                        \
++            __m256d vecA = _mm256_loadu_pd(in);                         \
+             in += types_per_step;                                       \
+-            __m256d vecB =  _mm256_loadu_pd(out);                       \
++            __m256d vecB = _mm256_loadu_pd(out);                        \
+             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
+             _mm256_storeu_pd(out, res);                                 \
+             out += types_per_step;                                      \
+@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
+         if( 0 == left_over ) return;                                    \
+       }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX_DOUBLE_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE2__
+ #define OP_AVX_SSE2_DOUBLE_FUNC(op)                                     \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
+         types_per_step = (128 / 8)  / sizeof(double);                   \
+@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
++#endif  /* __SSE2__ */
++#else
+ #define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
+  *  routines, needed for some optimizations.
+  */
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op)      \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {   \
+         int types_per_step = (512 / 8) / sizeof(type);                  \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512i vecA =  _mm512_loadu_si512(in1);                    \
+-            __m512i vecB =  _mm512_loadu_si512(in2);                    \
++            __m512i vecA = _mm512_loadu_si512(in1);                     \
++            __m512i vecB = _mm512_loadu_si512(in2);                     \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
+@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op)        \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+         int types_per_step = (256 / 8) / sizeof(type);                  \
+@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
+             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+-            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
++            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
+             _mm256_storeu_si256((__m256i*)out, res);                    \
+             out += types_per_step;                                      \
+         }                                                               \
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE3__ && __SSE2__
+ #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op)      \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) {       \
+         int types_per_step = (128 / 8) / sizeof(type);                  \
+@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
+             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+-            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
++            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
+             _mm_storeu_si128((__m128i*)out, res);                       \
+             out += types_per_step;                                      \
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
++#endif  /* __SSE3__ && __SSE2__ */
++#else
+ #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512BW__ && __AVX__
+ #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op)       \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
+         int types_per_step = (256 / 8) / sizeof(type);                  \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in1);      \
+-            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)in2);      \
++            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1);       \
++            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2);       \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
+@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+         if( 0 == left_over ) return;                                    \
+   }
+ #else
++#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
++#endif  /* __AVX512BW__ && __AVX__ */
++#else
+ #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ /**
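Editor's note on the hunk above: OP_AVX_AVX512_MUL_3 exists because x86 has no packed 8-bit multiply, so the macro loads 32 int8 lanes, widens them to int16 in a 512-bit register, multiplies, and narrows back. A minimal, self-contained sketch of that round trip (hypothetical function name, not Open MPI code; assumes a compiler given -mavx512f -mavx512bw):

    #include <immintrin.h>
    #include <stdint.h>

    /* Multiply 32 int8 values pairwise via widen -> multiply -> narrow. */
    static void mul32_int8(const int8_t *a, const int8_t *b, int8_t *out)
    {
        __m256i a8   = _mm256_loadu_si256((const __m256i *)a);
        __m256i b8   = _mm256_loadu_si256((const __m256i *)b);
        __m512i a16  = _mm512_cvtepi8_epi16(a8);         /* sign-extend to int16 */
        __m512i b16  = _mm512_cvtepi8_epi16(b8);
        __m512i prod = _mm512_mullo_epi16(a16, b16);     /* low 16 bits of product */
        _mm256_storeu_si256((__m256i *)out, _mm512_cvtepi16_epi8(prod)); /* truncate */
    }

Truncating back to int8 reproduces the low-byte result a scalar two's-complement loop would produce.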
+@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op)             \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
+         types_per_step = (512 / 8) / sizeof(type);                      \
+         for (; left_over >= types_per_step; left_over -= types_per_step) {  \
+-            __m512i vecA =  _mm512_loadu_si512(in1);                    \
+-            __m512i vecB =  _mm512_loadu_si512(in2);                    \
++            __m512i vecA = _mm512_loadu_si512(in1);                     \
++            __m512i vecB = _mm512_loadu_si512(in2);                     \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
+@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op)               \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+         types_per_step = (256 / 8) / sizeof(type);                      \
+@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+-            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
++            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
+             _mm256_storeu_si256((__m256i*)out, res);                    \
+             out += types_per_step;                                      \
+         }                                                               \
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE3__ && __SSE2__
+ #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op)               \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
+         types_per_step = (128 / 8) / sizeof(type);                      \
+@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
+             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+-            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
++            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
+             _mm_storeu_si128((__m128i*)out, res);                       \
+             out += types_per_step;                                      \
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
++#endif  /* __SSE3__ && __SSE2__ */
++#else
+ #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_FLOAT_FUNC_3(op)                                  \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
+         types_per_step = (512 / 8) / sizeof(float);                     \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512 vecA =  _mm512_loadu_ps(in1);                        \
+-            __m512 vecB =  _mm512_loadu_ps(in2);                        \
++            __m512 vecA = _mm512_loadu_ps(in1);                         \
++            __m512 vecB = _mm512_loadu_ps(in2);                         \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
+@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX_FLOAT_FUNC_3(op)                                     \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
+         types_per_step = (256 / 8) / sizeof(float);                     \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+-            __m256 vecA =  _mm256_loadu_ps(in1);                        \
+-            __m256 vecB =  _mm256_loadu_ps(in2);                        \
++            __m256 vecA = _mm256_loadu_ps(in1);                         \
++            __m256 vecB = _mm256_loadu_ps(in2);                         \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
+@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE__
+ #define OP_AVX_SSE_FLOAT_FUNC_3(op)                  \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
+         types_per_step = (128 / 8) / sizeof(float);                     \
+@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
++#endif  /* __SSE__ */
++#else
+ #define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
+ }
+ 
+ #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
++#if __AVX512F__
+ #define OP_AVX_AVX512_DOUBLE_FUNC_3(op)                                 \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
+         types_per_step = (512 / 8) / sizeof(double);                    \
+         for (; left_over >= types_per_step; left_over -= types_per_step) { \
+-            __m512d vecA =  _mm512_loadu_pd((in1));                     \
+-            __m512d vecB =  _mm512_loadu_pd((in2));                     \
++            __m512d vecA = _mm512_loadu_pd((in1));                      \
++            __m512d vecB = _mm512_loadu_pd((in2));                      \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
+@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
++#endif  /* __AVX512F__ */
++#else
+ #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+ 
+ #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
++#if __AVX__
+ #define OP_AVX_AVX_DOUBLE_FUNC_3(op)                                    \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
+         types_per_step = (256 / 8) / sizeof(double);                    \
+         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+-            __m256d vecA =  _mm256_loadu_pd(in1);                       \
+-            __m256d vecB =  _mm256_loadu_pd(in2);                       \
++            __m256d vecA = _mm256_loadu_pd(in1);                        \
++            __m256d vecB = _mm256_loadu_pd(in2);                        \
+             in1 += types_per_step;                                      \
+             in2 += types_per_step;                                      \
+             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
+@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
+         if( 0 == left_over ) return;                                    \
+     }
+ #else
++#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
++#endif  /* __AVX__ */
++#else
+ #define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+ 
+ #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
++#if __SSE2__
+ #define OP_AVX_SSE2_DOUBLE_FUNC_3(op)                                   \
+     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
+         types_per_step = (128 / 8) / sizeof(double);                    \
+@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
+         }                                                               \
+     }
+ #else
++#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
++#endif  /* __SSE2__ */
++#else
+ #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
+ #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+ 
+
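Editor's note on the pattern running through this whole file: every SIMD macro now sits behind two independent guards, a compile-time one (#if __AVX512F__ etc., i.e. did the compiler flags enable the ISA, with #error making a mismatch fail loudly at build time) and a run-time one (OMPI_OP_AVX_HAS_FLAGS, i.e. does the executing CPU actually have the feature). A self-contained sketch of the same split, with hypothetical names (build with gcc -mavx2 guard_sketch.c):

    #include <stdio.h>

    #if defined(__AVX2__)          /* defined by the compiler under -mavx2 */
    #include <immintrin.h>
    #define HAVE_AVX2_BUILD 1
    #else
    #define HAVE_AVX2_BUILD 0      /* ISA not enabled: compile the path out */
    #endif

    int main(void)
    {
    #if HAVE_AVX2_BUILD
        /* Run-time check: the build machine's flags say nothing about the
         * machine we execute on. OMPI_OP_AVX_HAS_FLAGS plays this role in
         * the component; __builtin_cpu_supports is the GCC/clang builtin. */
        if (__builtin_cpu_supports("avx2")) {
            __m256i v = _mm256_add_epi32(_mm256_set1_epi32(1),
                                         _mm256_set1_epi32(2));
            printf("avx2 path: lane0 = %d\n", _mm256_extract_epi32(v, 0));
            return 0;
        }
    #endif
        printf("scalar fallback path\n");
        return 0;
    }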
+From 20be3fc25713ac2de3eb4d77b85248d7fe2bc28b Mon Sep 17 00:00:00 2001
+From: George Bosilca <bosilca@icl.utk.edu>
+Date: Tue, 5 Jan 2021 22:40:26 -0500
+Subject: [PATCH 3/3] A better test for MPI_OP performance.
+
+The test can now apply a shift to any or all of the input and output
+buffers to assess the impact of unaligned operations.
+
+Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
+---
+ test/datatype/reduce_local.c | 161 ++++++++++++++++++++++-------------
+ 1 file changed, 104 insertions(+), 57 deletions(-)
+
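Before the diff itself, a hedged sketch of what "shifting" a buffer means in practice: allocate aligned storage with headroom, then offset the working pointer by whole elements so the vectorized reduction sees a misaligned stream. Names are illustrative, not the test's actual code:

    #define _POSIX_C_SOURCE 200112L
    #include <stdlib.h>

    int main(void)
    {
        void  *base  = NULL;
        size_t count = 1024, shift = 1;            /* shift by one element */
        /* Allocate with headroom so base + shift stays in bounds. */
        if (posix_memalign(&base, 64, (count + shift) * sizeof(double)) != 0)
            return 1;
        double *aligned    = (double *)base;       /* 64-byte aligned */
        double *misaligned = aligned + shift;      /* 8 bytes past alignment */
        (void)misaligned;  /* a kernel timed on this pointer exercises the
                              unaligned load/store paths */
        free(base);
        return 0;
    }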
+diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c
+index 97890f94227..f227439b714 100644
+--- a/test/datatype/reduce_local.c
++++ b/test/datatype/reduce_local.c
+@@ -59,7 +59,7 @@ static int total_errors = 0;
+      _a < _b ? _a : _b; })
+ 
+ static void print_status(char* op, char* type, int type_size,
+-                         int count, double duration,
++                         int count, int max_shift, double *duration, int repeats,
+                          int correct )
+ {
+     if(correct) {
+@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size,
+         printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
+         total_errors++;
+     }
+-    printf(" count  %-10d  time %.6f seconds\n", count, duration);
++    if( 1 == max_shift ) {
++        printf(" count  %-10d  time (seconds) %.8f seconds\n", count, duration[0] / repeats);
++    } else {
++        printf(" count  %-10d  time (seconds / shifts) ", count);
++        for( int i = 0; i < max_shift; i++ ) {
++            printf("%.8f ", duration[i] / repeats );
++        }
++        printf("\n");
++    }
+ }
+ 
+ static int do_ops_built = 0;
+@@ -115,19 +123,23 @@ do { \
+     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
+     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
+     skip_op_type = 0; \
+-    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
+-        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
+-        tstart = MPI_Wtime(); \
+-        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
+-        tend = MPI_Wtime(); \
+-        if( check ) { \
+-            for( i = 0; i < (COUNT)-_k; i++ ) { \
+-                if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
+-                    continue; \
+-                printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
+-                       _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
+-                correctness = 0; \
+-                break; \
++    for(int _k = 0; _k < min((COUNT), max_shift); _k++ ) { \
++        duration[_k] = 0.0; \
++        for(int _r = repeats; _r > 0; _r--) { \
++            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
++            tstart = MPI_Wtime(); \
++            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
++            tend = MPI_Wtime(); \
++            duration[_k] += (tend - tstart); \
++            if( check ) { \
++                for( i = 0; i < (COUNT)-_k; i++ ) { \
++                    if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
++                        continue; \
++                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
++                           _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
++                    correctness = 0; \
++                    break; \
++                } \
+             } \
+         } \
+     } \
+@@ -139,20 +151,24 @@ do { \
+     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
+     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
+     skip_op_type = 0; \
+-    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
+-        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
+-        tstart = MPI_Wtime(); \
+-        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
+-        tend = MPI_Wtime(); \
+-        if( check ) { \
+-            for( i = 0; i < (COUNT); i++ ) { \
+-                TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
+-                if(_v2 == OPNAME(_v1, _v3)) \
+-                    continue; \
+-                printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
+-                       _k, i, _v1, (#OPNAME), _v3, _v2); \
+-                correctness = 0; \
+-                break; \
++    for(int _k = 0; _k < min((COUNT), max_shift); _k++ ) { \
++        duration[_k] = 0.0; \
++        for(int _r = repeats; _r > 0; _r--) { \
++            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
++            tstart = MPI_Wtime(); \
++            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
++            tend = MPI_Wtime(); \
++            duration[_k] += (tend - tstart); \
++            if( check ) { \
++                for( i = 0; i < (COUNT); i++ ) { \
++                    TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
++                    if(_v2 == OPNAME(_v1, _v3)) \
++                        continue; \
++                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
++                           _k, i, _v1, (#OPNAME), _v3, _v2); \
++                    correctness = 0; \
++                    break; \
++                } \
+             } \
+         } \
+     } \
+@@ -163,24 +179,36 @@ int main(int argc, char **argv)
+ {
+     static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
+     int count, type_size = 8, rank, size, provided, correctness = 1;
+-    int repeats = 1, i, c;
+-    double tstart, tend;
++    int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0;
++    int max_shift = 4;
++    double *duration, tstart, tend;
+     bool check = true;
+     char type[5] = "uifd", *op = "sum", *mpi_type;
+     int lower = 1, upper = 1000000, skip_op_type;
+     MPI_Op mpi_op;
+ 
+-    while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) {
++    while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) {
+         switch(c) {
+         case 'l':
+             lower = atoi(optarg);
+             if( lower <= 0 ) {
+-                fprintf(stderr, "The number of elements must be positive\n");
++                fprintf(stderr, "The lower number of elements must be positive\n");
+                 exit(-1);
+             }
+             break;
+         case 'u':
+             upper = atoi(optarg);
++            if( upper <= 0 ) {
++                fprintf(stderr, "The upper number of elements must be positive\n");
++                exit(-1);
++            }
++            break;
++        case 'i':
++            max_shift = atoi(optarg);
++            if( max_shift <= 0 ) {
++                fprintf(stderr, "The max shift must be positive\n");
++                exit(-1);
++            }
+             break;
+         case 'f':
+             check = false;
+@@ -216,14 +244,32 @@ int main(int argc, char **argv)
+                 exit(-1);
+             }
+             break;
++        case '1':
++            op1_alignment = atoi(optarg);
++            if( op1_alignment < 0 ) {
++                fprintf(stderr, "alignment for the first operand must be positive\n");
++                exit(-1);
++            }
++            break;
++        case '2':
++            res_alignment = atoi(optarg);
++            if( res_alignment < 0 ) {
++                fprintf(stderr, "alignment for the result must be positive\n");
++                exit(-1);
++            }
++            break;
+         case 'h':
+             fprintf(stdout, "%s options are:\n"
+                     " -l <number> : lower number of elements\n"
+                     " -u <number> : upper number of elements\n"
+                     " -s <type_size> : 8, 16, 32 or 64 bits elements\n"
+                     " -t [i,u,f,d] : type of the elements to apply the operations on\n"
++                    " -r <number> : number of repetitions for each test\n"
+                     " -o <op> : comma separated list of operations to execute among\n"
+                     "           sum, min, max, prod, bor, bxor, band\n"
++                    " -i <number> : shift on all buffers to check alignment\n"
++                    " -1 <number> : (mis)alignment in elements for the first op\n"
++                    " -2 <number> : (mis)alignment in elements for the result\n"
+                     " -v: increase the verbosity level\n"
+                     " -h: this help message\n", argv[0]);
+             exit(0);
+@@ -233,9 +279,10 @@ int main(int argc, char **argv)
+     if( !do_ops_built ) {  /* not yet done, take the default */
+             build_do_ops( "all", do_ops);
+     }
+-    in_buf          = malloc(upper * sizeof(double));
+-    inout_buf       = malloc(upper * sizeof(double));
+-    inout_check_buf = malloc(upper * sizeof(double));
++    posix_memalign( &in_buf,          64, (upper + op1_alignment) * sizeof(double));
++    posix_memalign( &inout_buf,       64, (upper + res_alignment) * sizeof(double));
++    posix_memalign( &inout_check_buf, 64, upper * sizeof(double));
++    duration = (double*)malloc(max_shift * sizeof(double));
+ 
+     ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
+ 
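A caveat on the allocation hunk above: posix_memalign() reports failure through its return value (an errno-style code) and leaves the pointer unmodified, and the patch does not check it. A defensive wrapper, sketched with a hypothetical helper name, could look like:

    #define _POSIX_C_SOURCE 200112L
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical helper: abort loudly instead of dereferencing NULL later. */
    static void *xmemalign(size_t alignment, size_t size)
    {
        void *p = NULL;
        int rc = posix_memalign(&p, alignment, size);
        if (rc != 0) {
            fprintf(stderr, "posix_memalign(%zu, %zu) failed: %d\n",
                    alignment, size, rc);
            exit(EXIT_FAILURE);
        }
        return p;
    }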
+@@ -253,8 +300,8 @@ int main(int argc, char **argv)
+                 correctness = 1;
+                 if('i' == type[type_idx]) {
+                     if( 8 == type_size ) {
+-                        int8_t *in_int8 = (int8_t*)in_buf,
+-                            *inout_int8 = (int8_t*)inout_buf,
++                        int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)),
++                            *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)),
+                             *inout_int8_for_check = (int8_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_int8[i] = 5;
+@@ -299,8 +346,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 16 == type_size ) {
+-                        int16_t *in_int16 = (int16_t*)in_buf,
+-                            *inout_int16 = (int16_t*)inout_buf,
++                        int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)),
++                            *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)),
+                             *inout_int16_for_check = (int16_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_int16[i] = 5;
+@@ -345,8 +392,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 32 == type_size ) {
+-                        int32_t *in_int32 = (int32_t*)in_buf,
+-                            *inout_int32 = (int32_t*)inout_buf,
++                        int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)),
++                            *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)),
+                             *inout_int32_for_check = (int32_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_int32[i] = 5;
+@@ -391,8 +438,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 64 == type_size ) {
+-                        int64_t *in_int64 = (int64_t*)in_buf,
+-                            *inout_int64 = (int64_t*)inout_buf,
++                        int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)),
++                            *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)),
+                             *inout_int64_for_check = (int64_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_int64[i] = 5;
+@@ -440,8 +487,8 @@ int main(int argc, char **argv)
+ 
+                 if( 'u' == type[type_idx] ) {
+                     if( 8 == type_size ) {
+-                        uint8_t *in_uint8 = (uint8_t*)in_buf,
+-                            *inout_uint8 = (uint8_t*)inout_buf,
++                        uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)),
++                            *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)),
+                             *inout_uint8_for_check = (uint8_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_uint8[i] = 5;
+@@ -486,8 +533,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 16 == type_size ) {
+-                        uint16_t *in_uint16 = (uint16_t*)in_buf,
+-                            *inout_uint16 = (uint16_t*)inout_buf,
++                        uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)),
++                            *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)),
+                             *inout_uint16_for_check = (uint16_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_uint16[i] = 5;
+@@ -532,8 +579,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 32 == type_size ) {
+-                        uint32_t *in_uint32 = (uint32_t*)in_buf,
+-                            *inout_uint32 = (uint32_t*)inout_buf,
++                        uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)),
++                            *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)),
+                             *inout_uint32_for_check = (uint32_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_uint32[i] = 5;
+@@ -578,8 +625,8 @@ int main(int argc, char **argv)
+                         }
+                     }
+                     if( 64 == type_size ) {
+-                        uint64_t *in_uint64 = (uint64_t*)in_buf,
+-                              *inout_uint64 = (uint64_t*)inout_buf,
++                        uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)),
++                              *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)),
+                             *inout_uint64_for_check = (uint64_t*)inout_check_buf;
+                         for( i = 0; i < count; i++ ) {
+                             in_uint64[i] = 5;
+@@ -626,8 +673,8 @@ int main(int argc, char **argv)
+                 }
+ 
+                 if( 'f' == type[type_idx] ) {
+-                    float *in_float = (float*)in_buf,
+-                        *inout_float = (float*)inout_buf,
++                    float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)),
++                        *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)),
+                         *inout_float_for_check = (float*)inout_check_buf;
+                     for( i = 0; i < count; i++ ) {
+                         in_float[i] = 1000.0+1;
+@@ -658,8 +705,8 @@ int main(int argc, char **argv)
+                 }
+ 
+                 if( 'd' == type[type_idx] ) {
+-                    double *in_double = (double*)in_buf,
+-                        *inout_double = (double*)inout_buf,
++                    double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)),
++                        *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)),
+                         *inout_double_for_check = (double*)inout_check_buf;
+                     for( i = 0; i < count; i++ ) {
+                         in_double[i] = 10.0+1;
+@@ -691,7 +738,7 @@ int main(int argc, char **argv)
+         check_and_continue:
+                 if( !skip_op_type )
+                     print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
+-                                 mpi_type, type_size, count, tend-tstart, correctness);
++                                 mpi_type, type_size, count, max_shift, duration, repeats, correctness);
+             }
+             if( !skip_op_type )
+                 printf("\n");
diff --git a/8348.patch b/8348.patch
new file mode 100644
index 0000000..89d5fd7
--- /dev/null
+++ b/8348.patch
@@ -0,0 +1,25 @@
+From 838568da9fce85b4555b0e0cbd899c8e8ef75696 Mon Sep 17 00:00:00 2001
+From: George Bosilca <bosilca@icl.utk.edu>
+Date: Wed, 6 Jan 2021 13:30:40 -0500
+Subject: [PATCH] A started generalized request should be marked as pending.
+
+Fixes #8340
+
+Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
+(cherry picked from commit 434a2515f8aab11f505b2fca0b3d8cc41e24cef2)
+---
+ ompi/request/grequest.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/ompi/request/grequest.c b/ompi/request/grequest.c
+index c895b4232b6..02affd642aa 100644
+--- a/ompi/request/grequest.c
++++ b/ompi/request/grequest.c
+@@ -163,6 +163,7 @@ int ompi_grequest_start(
+     greq->greq_free.c_free = gfree_fn;
+     greq->greq_cancel.c_cancel = gcancel_fn;
+     greq->greq_base.req_status = ompi_status_empty;
++    greq->greq_base.req_complete = REQUEST_PENDING;
+ 
+     *request = &greq->greq_base;
+     return OMPI_SUCCESS;
diff --git a/openmpi.spec b/openmpi.spec
index a4042cc..52b5fcc 100644
--- a/openmpi.spec
+++ b/openmpi.spec
@@ -1,17 +1,3 @@
-%global _hardened_build 1
-# We only compile with gcc, but other people may want other compilers.
-# Set the compiler here.
-%global opt_cc gcc
-# Optional CFLAGS to use with the specific compiler...gcc doesn't need any,
-# so uncomment and define to use
-#global opt_cflags
-%global opt_cxx g++
-#global opt_cxxflags
-%global opt_f77 gfortran
-#global opt_fflags
-%global opt_fc gfortran
-#global opt_fcflags
-
 # Optional name suffix to use...we leave it off when compiling with gcc, but
 # for other compiled versions to install side by side, it will need a
 # suffix in order to keep the names from conflicting.
@@ -39,70 +25,85 @@
 %bcond_without rdma
 %endif
 
-Name:            openmpi%{?_cc_name_suffix}
-Version:         4.0.5
-Release:         2%{?dist}
-Summary:         Open Message Passing Interface
-License:         BSD and MIT and Romio
-URL:             http://www.open-mpi.org/
+# Run autogen - needed for some patches
+# For Patch0
+%bcond_without autogen
+
+Name:           openmpi%{?_cc_name_suffix}
+Version:        4.1.0
+Release:        5%{?dist}
+Summary:        Open Message Passing Interface
+License:        BSD and MIT and Romio
+URL:            http://www.open-mpi.org/
 
 # We can't use %%{name} here because of _cc_name_suffix
-Source0:         https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-%{version}.tar.bz2
-Source1:         openmpi.module.in
-Source2:         openmpi.pth.py2
-Source3:         openmpi.pth.py3
-Source4:         macros.openmpi
-
-BuildRequires:   gcc-c++
-BuildRequires:   gcc-gfortran
-BuildRequires:   valgrind-devel
+Source0:        https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-%{version}.tar.bz2
+Source1:        openmpi.module.in
+Source2:        openmpi.pth.py2
+Source3:        openmpi.pth.py3
+Source4:        macros.openmpi
+
+# Fix AVX library linkage
+Patch0:         https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8322.patch
+# Fix generalized requests (mpi4py test failure)
+Patch1:         https://patch-diff.githubusercontent.com/raw/open-mpi/ompi/pull/8348.patch
+
+BuildRequires:  gcc-c++
+BuildRequires:  gcc-gfortran
+BuildRequires:  make
+%if %{with autogen}
+BuildRequires:  libtool
+BuildRequires:  perl(Data::Dumper)
+BuildRequires:  perl(File::Find)
+%endif
+BuildRequires:  valgrind-devel
 %if %{with rdma}
-BuildRequires:   opensm-devel > 3.3.0
-BuildRequires:   rdma-core-devel
+BuildRequires:  opensm-devel > 3.3.0
+BuildRequires:  rdma-core-devel
 %endif
 # Doesn't compile:
 # vt_dyn.cc:958:28: error: 'class BPatch_basicBlockLoop' has no member named 'getLoopHead'
 #                      loop->getLoopHead()->getStartAddress(), loop_stmts );
-#BuildRequires:   dyninst-devel
-BuildRequires:   hwloc-devel
+#BuildRequires:  dyninst-devel
+BuildRequires:  hwloc-devel
 # So configure can find lstopo
-BuildRequires:   hwloc-gui
-BuildRequires:   java-devel
+BuildRequires:  hwloc-gui
+BuildRequires:  java-devel
 # Old libevent causes issues
 %if !0%{?el7}
-BuildRequires:   libevent-devel
+BuildRequires:  libevent-devel
 %endif
-BuildRequires:   libfabric-devel
+BuildRequires:  libfabric-devel
 %ifnarch s390 s390x
-BuildRequires:   papi-devel
+BuildRequires:  papi-devel
 %endif
-BuildRequires:   orangefs-devel
-BuildRequires:   perl-generators
-BuildRequires:   perl-interpreter
-BuildRequires:   perl(Getopt::Long)
-BuildRequires:   pmix-devel
-BuildRequires:   python%{python3_pkgversion}-devel
+BuildRequires:  orangefs-devel
+BuildRequires:  perl-generators
+BuildRequires:  perl-interpreter
+BuildRequires:  perl(Getopt::Long)
+BuildRequires:  pmix-devel
+BuildRequires:  python%{python3_pkgversion}-devel
 %ifarch x86_64
-BuildRequires:   infinipath-psm-devel
-BuildRequires:   libpsm2-devel
+BuildRequires:  infinipath-psm-devel
+BuildRequires:  libpsm2-devel
 %endif
 %if %{with ucx}
-BuildRequires:   ucx-devel
+BuildRequires:  ucx-devel
 %endif
-BuildRequires:   zlib-devel
+BuildRequires:  zlib-devel
 %if !0%{?el7}
-BuildRequires:   rpm-mpi-hooks
+BuildRequires:  rpm-mpi-hooks
 %endif
 
-Provides:        mpi
+Provides:       mpi
 %if 0%{?rhel}
 # Need this for /etc/profile.d/modules.sh
-Requires:        environment-modules
+Requires:       environment-modules
 %endif
-Requires:        environment(modules)
+Requires:       environment(modules)
 # openmpi currently requires ssh to run
 # https://svn.open-mpi.org/trac/ompi/ticket/4228
-Requires:        openssh-clients
+Requires:       openssh-clients
 
 # Private openmpi libraries
 %global __provides_exclude_from %{_libdir}/openmpi/lib/(lib(mca|ompi|open-(pal|rte|trace))|openmpi/).*.so
@@ -133,17 +134,17 @@ Requires:	(python(abi) = %{python3_version} if python3)
 Contains development headers and libraries for openmpi.
 
 %package java
-Summary:    Java library
-Requires:   %{name} = %{version}-%{release}
-Requires:   java-headless
+Summary:        Java library
+Requires:       %{name} = %{version}-%{release}
+Requires:       java-headless
 
 %description java
 Java library.
 
 %package java-devel
-Summary:    Java development files for openmpi
-Requires:   %{name}-java = %{version}-%{release}
-Requires:   java-devel
+Summary:        Java development files for openmpi
+Requires:       %{name}-java = %{version}-%{release}
+Requires:       java-devel
 
 %description java-devel
 Contains development wrapper for compiling Java with openmpi.
@@ -154,19 +155,19 @@ Contains development wrapper for compiling Java with openmpi.
 
 %if %{with python2}
 %package -n python2-openmpi
-Summary:    OpenMPI support for Python 2
-BuildRequires: python2-devel
-Requires:   %{name} = %{version}-%{release}
-Requires:   python(abi) = %{python2_version}
+Summary:        OpenMPI support for Python 2
+BuildRequires:  python2-devel
+Requires:       %{name} = %{version}-%{release}
+Requires:       python(abi) = %{python2_version}
 
 %description -n python2-openmpi
 OpenMPI support for Python 2.
 %endif
 
 %package -n python%{python3_pkgversion}-openmpi
-Summary:    OpenMPI support for Python 3
-Requires:   %{name} = %{version}-%{release}
-Requires:   python(abi) = %{python3_version}
+Summary:        OpenMPI support for Python 3
+Requires:       %{name} = %{version}-%{release}
+Requires:       python(abi) = %{python3_version}
 
 %description -n python%{python3_pkgversion}-openmpi
 OpenMPI support for Python 3.
@@ -174,8 +175,13 @@ OpenMPI support for Python 3.
 
 %prep
 %autosetup -p1
+%if %{with autogen}
+./autogen.pl --force
+%endif
+
 
 %build
+%set_build_flags
 ./configure --prefix=%{_libdir}/%{name} \
 	--mandir=%{_mandir}/%{namearch} \
 	--includedir=%{_includedir}/%{namearch} \
@@ -193,12 +199,6 @@ OpenMPI support for Python 3.
 	--with-libevent=external \
 	--with-pmix=external \
 %endif
-	CC=%{opt_cc} CXX=%{opt_cxx} \
-	LDFLAGS='%{__global_ldflags}' \
-	CFLAGS="%{?opt_cflags} %{!?opt_cflags:$RPM_OPT_FLAGS}" \
-	CXXFLAGS="%{?opt_cxxflags} %{!?opt_cxxflags:$RPM_OPT_FLAGS}" \
-	FC=%{opt_fc} FCFLAGS="%{?opt_fcflags} %{!?opt_fcflags:$RPM_OPT_FLAGS}"
-#        --with-contrib-vt-flags='CXXFLAGS="-I%{_includedir}/dyninst -L%{_libdir}/dyninst"' \
 
 %make_build V=1
 
@@ -281,6 +281,7 @@ make check
 %{_libdir}/%{name}/bin/shmemrun
 %endif
 %{_libdir}/%{name}/lib/*.so.40*
+%{_libdir}/%{name}/lib/libmca_common_ofi.so.10*
 %{_libdir}/%{name}/lib/libmca*.so.41*
 %{_libdir}/%{name}/lib/libmca*.so.50*
 %if 0%{?el7}
@@ -362,6 +363,22 @@ make check
 
 
 %changelog
+* Thu Jan 28 2021 Orion Poplawski <orion@nwra.com> - 4.1.0-5
+- Add upstream patch for generalized requests
+
+* Thu Jan 28 2021 Orion Poplawski <orion@nwra.com> - 4.1.0-4
+- Add upstream patch to fix AVX library linkage
+
+* Tue Jan 26 2021 Fedora Release Engineering <releng@fedoraproject.org> - 4.1.0-3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild
+
+* Sun Jan 24 2021 Orion Poplawski <orion@nwra.com> - 4.1.0-2
+- Use set_build_flags macro
+- Drop old opt_ macros
+
+* Sun Jan 24 2021 Orion Poplawski <orion@nwra.com> - 4.1.0-1
+- Update to 4.1.0
+
 * Wed Sep 23 2020 Orion Poplawski <orion@nwra.com> - 4.0.5-2
 - Rebuild for libevent 2.1.12
 
diff --git a/sources b/sources
index f510896..ebb82c2 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (openmpi-4.0.5.tar.bz2) = b7a1a5ccfc0eaa0f0504ff770b550480f7ae6727fa891e3310d9340a0d844a2ceddf62c2e59efd047ab9416b24c829919bbccd29606ca0e0d7a0569dad800011
+SHA512 (openmpi-4.1.0.tar.bz2) = eaf086ab4929ce5a9a3e867c8315bf802ff4dc75d3f05d740e22dfd97803a4559212dacbe06920d42ac6644f46057eb6980cccf5a8b0a7df9c5bdf5bffc0b3a6