DistroBaker 51ea11
From 31068e063b8795ae11f3a59d4080db1fe111cfaf Mon Sep 17 00:00:00 2001
DistroBaker 51ea11
From: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
Date: Mon, 28 Dec 2020 15:36:05 -0500
DistroBaker 51ea11
Subject: [PATCH 1/3] Major update to the AVX* detection and support
DistroBaker 51ea11
DistroBaker 51ea11
1. Consistent march flag order between configure and make.
DistroBaker 51ea11
DistroBaker 51ea11
2. op/avx: give the option to skip some tests
DistroBaker 51ea11
DistroBaker 51ea11
it is possible to skip some intrinsic tests by setting some environment variables to "no" before invoking configure:
DistroBaker 51ea11
 - ompi_cv_op_avx_check_avx512
DistroBaker 51ea11
 - ompi_cv_op_avx_check_avx2
DistroBaker 51ea11
 - ompi_cv_op_avx_check_avx
DistroBaker 51ea11
 - ompi_cv_op_avx_check_sse41
DistroBaker 51ea11
 - ompi_cv_op_avx_check_sse3
DistroBaker 51ea11
DistroBaker 51ea11
3. op/avx: update AVX512 flags
DistroBaker 51ea11
DistroBaker 51ea11
try
DistroBaker 51ea11
-mavx512f -mavx512bw -mavx512vl -mavx512dq
DistroBaker 51ea11
instead of
DistroBaker 51ea11
-march=skylake-avx512
DistroBaker 51ea11
DistroBaker 51ea11
since the former is less likely to conflict with user provided CFLAGS
DistroBaker 51ea11
(e.g. -march=...)
DistroBaker 51ea11
DistroBaker 51ea11
Thanks Bart Oldeman for pointing this.
DistroBaker 51ea11
DistroBaker 51ea11
4. op/avx: have the op/avx library depend on libmpi.so
DistroBaker 51ea11
DistroBaker 51ea11
Refs. open-mpi/ompi#8323
DistroBaker 51ea11
DistroBaker 51ea11
Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
DistroBaker 51ea11
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
---
DistroBaker 51ea11
 ompi/mca/op/avx/Makefile.am  |   4 +-
DistroBaker 51ea11
 ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++-----------------
DistroBaker 51ea11
 2 files changed, 174 insertions(+), 155 deletions(-)
DistroBaker 51ea11
DistroBaker 51ea11
diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
DistroBaker 51ea11
index 41dcf2e1834..b1d84d90b33 100644
DistroBaker 51ea11
--- a/ompi/mca/op/avx/Makefile.am
DistroBaker 51ea11
+++ b/ompi/mca/op/avx/Makefile.am
DistroBaker 51ea11
@@ -2,7 +2,7 @@
DistroBaker 51ea11
 # Copyright (c) 2019-2020 The University of Tennessee and The University
DistroBaker 51ea11
 #                         of Tennessee Research Foundation.  All rights
DistroBaker 51ea11
 #                         reserved.
DistroBaker 51ea11
-# Copyright (c) 2020      Research Organization for Information Science
DistroBaker 51ea11
+# Copyright (c) 2020-2021 Research Organization for Information Science
DistroBaker 51ea11
 #                         and Technology (RIST).  All rights reserved.
DistroBaker 51ea11
 # $COPYRIGHT$
DistroBaker 51ea11
 #
DistroBaker 51ea11
@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir)
DistroBaker 51ea11
 mcacomponent_LTLIBRARIES = $(component_install)
DistroBaker 51ea11
 mca_op_avx_la_SOURCES = $(sources)
DistroBaker 51ea11
 mca_op_avx_la_LIBADD = $(specialized_op_libs)
DistroBaker 51ea11
-mca_op_avx_la_LDFLAGS = -module -avoid-version
DistroBaker 51ea11
+mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
DistroBaker 51ea11
 
DistroBaker 51ea11
 
DistroBaker 51ea11
 # Specific information for static builds.
DistroBaker 51ea11
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
index 09d8b374c8e..f61b7100ef4 100644
DistroBaker 51ea11
--- a/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
+++ b/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
     op_avx_support=0
DistroBaker 51ea11
     op_avx2_support=0
DistroBaker 51ea11
     op_avx512_support=0
DistroBaker 51ea11
+
DistroBaker 51ea11
+    AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3])
DistroBaker 51ea11
+    AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41])
DistroBaker 51ea11
+    AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx])
DistroBaker 51ea11
+    AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2])
DistroBaker 51ea11
+    AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512])
DistroBaker 51ea11
+
DistroBaker 51ea11
     OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
DistroBaker 51ea11
 
DistroBaker 51ea11
     AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
DistroBaker 51ea11
@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for AVX512 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
DistroBaker 51ea11
-           AC_LINK_IFELSE(
DistroBaker 51ea11
-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                                [[
DistroBaker 51ea11
-    __m512 vA, vB;
DistroBaker 51ea11
-    _mm512_add_ps(vA, vB)
DistroBaker 51ea11
-                                ]])],
DistroBaker 51ea11
-               [op_avx512_support=1
DistroBaker 51ea11
-                AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-               [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-
DistroBaker 51ea11
-           AS_IF([test $op_avx512_support -eq 0],
DistroBaker 51ea11
-                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
DistroBaker 51ea11
-                  op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                  CFLAGS="$CFLAGS -march=skylake-avx512"
DistroBaker 51ea11
+           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
DistroBaker 51ea11
+           AS_IF([test "$op_avx_check_avx512" = "yes"],
DistroBaker 51ea11
+                 [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                                        [[
DistroBaker 51ea11
@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
     _mm512_add_ps(vA, vB)
DistroBaker 51ea11
                                        ]])],
DistroBaker 51ea11
                       [op_avx512_support=1
DistroBaker 51ea11
-                       MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
DistroBaker 51ea11
                        AC_MSG_RESULT([yes])],
DistroBaker 51ea11
                       [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                  CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-                 ])
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           # Some combination of gcc and older as would not correctly build the code generated by
DistroBaker 51ea11
-           # _mm256_loadu_si256. Screen them out.
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           AS_IF([test $op_avx512_support -eq 1],
DistroBaker 51ea11
-                 [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
DistroBaker 51ea11
-                  op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
DistroBaker 51ea11
-                  AC_LINK_IFELSE(
DistroBaker 51ea11
-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                               [[
DistroBaker 51ea11
+
DistroBaker 51ea11
+                  AS_IF([test $op_avx512_support -eq 0],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)])
DistroBaker 51ea11
+                         op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                         CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS"
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                              [[
DistroBaker 51ea11
+    __m512 vA, vB;
DistroBaker 51ea11
+    _mm512_add_ps(vA, vB)
DistroBaker 51ea11
+                                       ]])],
DistroBaker 51ea11
+                             [op_avx512_support=1
DistroBaker 51ea11
+                              MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq"
DistroBaker 51ea11
+                              AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                         CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
+                        ])
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  # Some combination of gcc and older as would not correctly build the code generated by
DistroBaker 51ea11
+                  # _mm256_loadu_si256. Screen them out.
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  AS_IF([test $op_avx512_support -eq 1],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
DistroBaker 51ea11
+                         op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                      [[
DistroBaker 51ea11
     int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
DistroBaker 51ea11
     __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
DistroBaker 51ea11
-                               ]])],
DistroBaker 51ea11
-                      [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                      [op_avx512_support=0
DistroBaker 51ea11
-                       MCA_BUILD_OP_AVX512_FLAGS=""
DistroBaker 51ea11
-                       AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                  CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-                 ])
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           AS_IF([test $op_avx512_support -eq 1],
DistroBaker 51ea11
-                 [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
DistroBaker 51ea11
-                  op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
DistroBaker 51ea11
-                  AC_LINK_IFELSE(
DistroBaker 51ea11
-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                               [[
DistroBaker 51ea11
+                                      ]])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [op_avx512_support=0
DistroBaker 51ea11
+                              MCA_BUILD_OP_AVX512_FLAGS=""
DistroBaker 51ea11
+                              AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                         CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
+                        ])
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  AS_IF([test $op_avx512_support -eq 1],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
DistroBaker 51ea11
+                         op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                      [[
DistroBaker 51ea11
     __m512i vA, vB;
DistroBaker 51ea11
     _mm512_mullo_epi64(vA, vB)
DistroBaker 51ea11
-                               ]])],
DistroBaker 51ea11
-                      [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                      [op_avx512_support=0
DistroBaker 51ea11
-                       MCA_BUILD_OP_AVX512_FLAGS=""
DistroBaker 51ea11
-                       AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                  CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-                 ])
DistroBaker 51ea11
+                                      ]])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [op_avx512_support=0
DistroBaker 51ea11
+                              MCA_BUILD_OP_AVX512_FLAGS=""
DistroBaker 51ea11
+                              AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                         CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
+                        ])])
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check support for AVX2
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_MSG_CHECKING([for AVX2 support (no additional flags)])
DistroBaker 51ea11
-           AC_LINK_IFELSE(
DistroBaker 51ea11
-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                       [[
DistroBaker 51ea11
+           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
DistroBaker 51ea11
+           AS_IF([test "$op_avx_check_avx2" = "yes"],
DistroBaker 51ea11
+                 [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
DistroBaker 51ea11
+                  AC_LINK_IFELSE(
DistroBaker 51ea11
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                              [[
DistroBaker 51ea11
     __m256 vA, vB;
DistroBaker 51ea11
     _mm256_add_ps(vA, vB)
DistroBaker 51ea11
-                       ]])],
DistroBaker 51ea11
-               [op_avx2_support=1
DistroBaker 51ea11
-                AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-               [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-           AS_IF([test $op_avx2_support -eq 0],
DistroBaker 51ea11
-               [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
DistroBaker 51ea11
-                op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                CFLAGS="$CFLAGS -mavx2"
DistroBaker 51ea11
-                AC_LINK_IFELSE(
DistroBaker 51ea11
-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                            [[
DistroBaker 51ea11
+                              ]])],
DistroBaker 51ea11
+                      [op_avx2_support=1
DistroBaker 51ea11
+                       AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                      [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                  AS_IF([test $op_avx2_support -eq 0],
DistroBaker 51ea11
+                      [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
DistroBaker 51ea11
+                       op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                       CFLAGS="-mavx2 $CFLAGS"
DistroBaker 51ea11
+                       AC_LINK_IFELSE(
DistroBaker 51ea11
+                           [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                   [[
DistroBaker 51ea11
     __m256 vA, vB;
DistroBaker 51ea11
     _mm256_add_ps(vA, vB)
DistroBaker 51ea11
-                            ]])],
DistroBaker 51ea11
-                    [op_avx2_support=1
DistroBaker 51ea11
-                     MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
DistroBaker 51ea11
-                     AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                    [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-                ])
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           # Some combination of gcc and older as would not correctly build the code generated by
DistroBaker 51ea11
-           # _mm256_loadu_si256. Screen them out.
DistroBaker 51ea11
-           #
DistroBaker 51ea11
-           AS_IF([test $op_avx2_support -eq 1],
DistroBaker 51ea11
-                 [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
DistroBaker 51ea11
-                  op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
DistroBaker 51ea11
-                  AC_LINK_IFELSE(
DistroBaker 51ea11
-                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                               [[
DistroBaker 51ea11
+                                   ]])],
DistroBaker 51ea11
+                           [op_avx2_support=1
DistroBaker 51ea11
+                            MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
DistroBaker 51ea11
+                            AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                           [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                       CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
+                       ])
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  # Some combination of gcc and older as would not correctly build the code generated by
DistroBaker 51ea11
+                  # _mm256_loadu_si256. Screen them out.
DistroBaker 51ea11
+                  #
DistroBaker 51ea11
+                  AS_IF([test $op_avx2_support -eq 1],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
DistroBaker 51ea11
+                         op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                         CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                      [[
DistroBaker 51ea11
     int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
DistroBaker 51ea11
     __m256i vA = _mm256_loadu_si256((__m256i*)&A)
DistroBaker 51ea11
-                               ]])],
DistroBaker 51ea11
-                      [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                      [op_avx2_support=0
DistroBaker 51ea11
-                       MCA_BUILD_OP_AVX2_FLAGS=""
DistroBaker 51ea11
-                       AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                  CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-                 ])
DistroBaker 51ea11
+                                      ]])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [op_avx2_support=0
DistroBaker 51ea11
+                              MCA_BUILD_OP_AVX2_FLAGS=""
DistroBaker 51ea11
+                              AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                         CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
+                        ])])
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # What about early AVX support. The rest of the logic is slightly different as
DistroBaker 51ea11
            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
DistroBaker 51ea11
@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
DistroBaker 51ea11
            # instructions.
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_MSG_CHECKING([for AVX support (no additional flags)])
DistroBaker 51ea11
-           AC_LINK_IFELSE(
DistroBaker 51ea11
-               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                       [[
DistroBaker 51ea11
+           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
DistroBaker 51ea11
+           AS_IF([test "$op_avx_check_avx" = "yes"],
DistroBaker 51ea11
+                 [AC_MSG_CHECKING([for AVX support (no additional flags)])
DistroBaker 51ea11
+                  AC_LINK_IFELSE(
DistroBaker 51ea11
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                              [[
DistroBaker 51ea11
     __m128 vA, vB;
DistroBaker 51ea11
     _mm_add_ps(vA, vB)
DistroBaker 51ea11
-                       ]])],
DistroBaker 51ea11
-               [op_avx_support=1
DistroBaker 51ea11
-                AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-               [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                              ]])],
DistroBaker 51ea11
+                      [op_avx_support=1
DistroBaker 51ea11
+                       AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                      [AC_MSG_RESULT([no])])])
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for SSE4.1 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AS_IF([test $op_avx_support -eq 1],
DistroBaker 51ea11
-               [AC_MSG_CHECKING([for SSE4.1 support])
DistroBaker 51ea11
-                AC_LINK_IFELSE(
DistroBaker 51ea11
-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                            [[
DistroBaker 51ea11
+           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
DistroBaker 51ea11
+           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
DistroBaker 51ea11
+                 [AC_MSG_CHECKING([for SSE4.1 support])
DistroBaker 51ea11
+                  AC_LINK_IFELSE(
DistroBaker 51ea11
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                              [[
DistroBaker 51ea11
     __m128i vA, vB;
DistroBaker 51ea11
     (void)_mm_max_epi8(vA, vB)
DistroBaker 51ea11
-                            ]])],
DistroBaker 51ea11
-                    [op_sse41_support=1
DistroBaker 51ea11
-                     AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                    [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                ])
DistroBaker 51ea11
+                              ]])],
DistroBaker 51ea11
+                      [op_sse41_support=1
DistroBaker 51ea11
+                       AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                      [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                  ])
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for SSE3 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AS_IF([test $op_avx_support -eq 1],
DistroBaker 51ea11
-               [AC_MSG_CHECKING([for SSE3 support])
DistroBaker 51ea11
-                AC_LINK_IFELSE(
DistroBaker 51ea11
-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                            [[
DistroBaker 51ea11
+           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
DistroBaker 51ea11
+           AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
DistroBaker 51ea11
+                 [AC_MSG_CHECKING([for SSE3 support])
DistroBaker 51ea11
+                  AC_LINK_IFELSE(
DistroBaker 51ea11
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                              [[
DistroBaker 51ea11
     int A[4] = {0, 1, 2, 3};
DistroBaker 51ea11
     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
DistroBaker 51ea11
-                            ]])],
DistroBaker 51ea11
-                    [op_sse3_support=1
DistroBaker 51ea11
-                     AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                    [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                ])
DistroBaker 51ea11
+                              ]])],
DistroBaker 51ea11
+                      [op_sse3_support=1
DistroBaker 51ea11
+                       AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                      [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                  ])
DistroBaker 51ea11
            # Second pass, do we need to add the AVX flag ?
DistroBaker 51ea11
            AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
DistroBaker 51ea11
-               [AC_MSG_CHECKING([for AVX support (with -mavx)])
DistroBaker 51ea11
-                op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
-                CFLAGS="$CFLAGS -mavx"
DistroBaker 51ea11
-                AC_LINK_IFELSE(
DistroBaker 51ea11
-                    [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                            [[
DistroBaker 51ea11
+                 [AS_IF([test "$op_avx_check_avx" = "yes"],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([for AVX support (with -mavx)])
DistroBaker 51ea11
+                         op_avx_cflags_save="$CFLAGS"
DistroBaker 51ea11
+                         CFLAGS="-mavx $CFLAGS"
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                   [[
DistroBaker 51ea11
     __m128 vA, vB;
DistroBaker 51ea11
     _mm_add_ps(vA, vB)
DistroBaker 51ea11
                             ]])],
DistroBaker 51ea11
-                    [op_avx_support=1
DistroBaker 51ea11
-                     MCA_BUILD_OP_AVX_FLAGS="-mavx"
DistroBaker 51ea11
-                     op_sse41_support=0
DistroBaker 51ea11
-                     op_sse3_support=0
DistroBaker 51ea11
-                     AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                    [AC_MSG_RESULT([no])])
DistroBaker 51ea11
+                             [op_avx_support=1
DistroBaker 51ea11
+                              MCA_BUILD_OP_AVX_FLAGS="-mavx"
DistroBaker 51ea11
+                              op_sse41_support=0
DistroBaker 51ea11
+                              op_sse3_support=0
DistroBaker 51ea11
+                              AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([no])])])
DistroBaker 51ea11
 
DistroBaker 51ea11
-                AS_IF([test $op_sse41_support -eq 0],
DistroBaker 51ea11
-                    [AC_MSG_CHECKING([for SSE4.1 support])
DistroBaker 51ea11
-                     AC_LINK_IFELSE(
DistroBaker 51ea11
-                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
-                                 [[
DistroBaker 51ea11
+                  AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([for SSE4.1 support])
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                     [[
DistroBaker 51ea11
     __m128i vA, vB;
DistroBaker 51ea11
     (void)_mm_max_epi8(vA, vB)
DistroBaker 51ea11
-                                 ]])],
DistroBaker 51ea11
-                         [op_sse41_support=1
DistroBaker 51ea11
-                          AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                         [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                     ])
DistroBaker 51ea11
-                AS_IF([test $op_sse3_support -eq 0],
DistroBaker 51ea11
-                    [AC_MSG_CHECKING([for SSE3 support])
DistroBaker 51ea11
-                     AC_LINK_IFELSE(
DistroBaker 51ea11
-                         [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
+                                     ]])],
DistroBaker 51ea11
+                             [op_sse41_support=1
DistroBaker 51ea11
+                              AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([no])])])
DistroBaker 51ea11
+                  AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0],
DistroBaker 51ea11
+                        [AC_MSG_CHECKING([for SSE3 support])
DistroBaker 51ea11
+                         AC_LINK_IFELSE(
DistroBaker 51ea11
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                                  [[
DistroBaker 51ea11
     int A[4] = {0, 1, 2, 3};
DistroBaker 51ea11
     __m128i vA = _mm_lddqu_si128((__m128i*)&A)
DistroBaker 51ea11
                                  ]])],
DistroBaker 51ea11
-                         [op_sse3_support=1
DistroBaker 51ea11
-                          AC_MSG_RESULT([yes])],
DistroBaker 51ea11
-                         [AC_MSG_RESULT([no])])
DistroBaker 51ea11
-                     ])
DistroBaker 51ea11
-                CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
-               ])
DistroBaker 51ea11
+                             [op_sse3_support=1
DistroBaker 51ea11
+                              AC_MSG_RESULT([yes])],
DistroBaker 51ea11
+                             [AC_MSG_RESULT([no])])])
DistroBaker 51ea11
+                  CFLAGS="$op_avx_cflags_save"])
DistroBaker 51ea11
 
DistroBaker 51ea11
            AC_LANG_POP([C])
DistroBaker 51ea11
           ])
DistroBaker 51ea11
@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
     AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
DistroBaker 51ea11
     AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
DistroBaker 51ea11
 
DistroBaker 51ea11
+    AS_VAR_POPDEF([op_avx_check_avx512])
DistroBaker 51ea11
+    AS_VAR_POPDEF([op_avx_check_avx2])
DistroBaker 51ea11
+    AS_VAR_POPDEF([op_avx_check_avx])
DistroBaker 51ea11
+    AS_VAR_POPDEF([op_avx_check_sse41])
DistroBaker 51ea11
+    AS_VAR_POPDEF([op_avx_check_sse3])
DistroBaker 51ea11
+
DistroBaker 51ea11
     OPAL_VAR_SCOPE_POP
DistroBaker 51ea11
     # Enable this component iff we have at least the most basic form of support
DistroBaker 51ea11
     # for vectorial ISA
DistroBaker 51ea11
DistroBaker 51ea11
From fcf2766a03e3c2a1001679013878209bcddd50ae Mon Sep 17 00:00:00 2001
DistroBaker 51ea11
From: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
Date: Mon, 28 Dec 2020 12:18:07 -0500
DistroBaker 51ea11
Subject: [PATCH 2/3] AVX code generation improvements
DistroBaker 51ea11
DistroBaker 51ea11
1. Allow fallback to a lesser AVX support during make
DistroBaker 51ea11
DistroBaker 51ea11
Due to the fact that some distro restrict the compiule architecture
DistroBaker 51ea11
during make (while not setting any restrictions during configure) we
DistroBaker 51ea11
need to detect the target architecture also during make in order to
DistroBaker 51ea11
restrict the code we generate.
DistroBaker 51ea11
DistroBaker 51ea11
2. Add comments and better protect the arch specific code.
DistroBaker 51ea11
DistroBaker 51ea11
Identify all the vectorial functions used and clasify them according to
DistroBaker 51ea11
the neccesary hardware capabilities.
DistroBaker 51ea11
Use these requirements to protect the code for load and stores (the rest
DistroBaker 51ea11
of the code being automatically generated it is more difficult to
DistroBaker 51ea11
protect).
DistroBaker 51ea11
DistroBaker 51ea11
3. Correctly check for AVX* support.
DistroBaker 51ea11
DistroBaker 51ea11
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
---
DistroBaker 51ea11
 ompi/mca/op/avx/configure.m4       |  28 +--
DistroBaker 51ea11
 ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++-----
DistroBaker 51ea11
 2 files changed, 288 insertions(+), 62 deletions(-)
DistroBaker 51ea11
DistroBaker 51ea11
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
index f61b7100ef4..f3651f09d43 100644
DistroBaker 51ea11
--- a/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
+++ b/ompi/mca/op/avx/configure.m4
DistroBaker 51ea11
@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for AVX512 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
DistroBaker 51ea11
+           AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
DistroBaker 51ea11
            AS_IF([test "$op_avx_check_avx512" = "yes"],
DistroBaker 51ea11
                  [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check support for AVX2
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
DistroBaker 51ea11
+           AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
DistroBaker 51ea11
            AS_IF([test "$op_avx_check_avx2" = "yes"],
DistroBaker 51ea11
                  [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                               [[
DistroBaker 51ea11
-    __m256 vA, vB;
DistroBaker 51ea11
-    _mm256_add_ps(vA, vB)
DistroBaker 51ea11
+    __m256i vA, vB, vC;
DistroBaker 51ea11
+    vC = _mm256_and_si256(vA, vB)
DistroBaker 51ea11
                               ]])],
DistroBaker 51ea11
                       [op_avx2_support=1
DistroBaker 51ea11
                        AC_MSG_RESULT([yes])],
DistroBaker 51ea11
@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
                        AC_LINK_IFELSE(
DistroBaker 51ea11
                            [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                                    [[
DistroBaker 51ea11
-    __m256 vA, vB;
DistroBaker 51ea11
-    _mm256_add_ps(vA, vB)
DistroBaker 51ea11
+    __m256i vA, vB, vC;
DistroBaker 51ea11
+    vC = _mm256_and_si256(vA, vB)
DistroBaker 51ea11
                                    ]])],
DistroBaker 51ea11
                            [op_avx2_support=1
DistroBaker 51ea11
                             MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
DistroBaker 51ea11
@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
                          CFLAGS="$op_avx_cflags_save"
DistroBaker 51ea11
                         ])])
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           # What about early AVX support. The rest of the logic is slightly different as
DistroBaker 51ea11
+           # What about early AVX support? The rest of the logic is slightly different as
DistroBaker 51ea11
            # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
DistroBaker 51ea11
            # if we can compile AVX code without a flag, then we validate that we have support
DistroBaker 51ea11
            # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
DistroBaker 51ea11
            # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
DistroBaker 51ea11
            # instructions.
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
DistroBaker 51ea11
+           AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
DistroBaker 51ea11
            AS_IF([test "$op_avx_check_avx" = "yes"],
DistroBaker 51ea11
                  [AC_MSG_CHECKING([for AVX support (no additional flags)])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
                       [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                               [[
DistroBaker 51ea11
-    __m128 vA, vB;
DistroBaker 51ea11
-    _mm_add_ps(vA, vB)
DistroBaker 51ea11
+    __m256 vA, vB, vC;
DistroBaker 51ea11
+    vC = _mm256_add_ps(vA, vB)
DistroBaker 51ea11
                               ]])],
DistroBaker 51ea11
                       [op_avx_support=1
DistroBaker 51ea11
                        AC_MSG_RESULT([yes])],
DistroBaker 51ea11
@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for SSE4.1 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
DistroBaker 51ea11
+           AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
DistroBaker 51ea11
            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
DistroBaker 51ea11
                  [AC_MSG_CHECKING([for SSE4.1 support])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
            #
DistroBaker 51ea11
            # Check for SSE3 support
DistroBaker 51ea11
            #
DistroBaker 51ea11
-           AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
DistroBaker 51ea11
+           AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
DistroBaker 51ea11
            AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
DistroBaker 51ea11
                  [AC_MSG_CHECKING([for SSE3 support])
DistroBaker 51ea11
                   AC_LINK_IFELSE(
DistroBaker 51ea11
@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
DistroBaker 51ea11
                          AC_LINK_IFELSE(
DistroBaker 51ea11
                              [AC_LANG_PROGRAM([[#include <immintrin.h>]],
DistroBaker 51ea11
                                    [[
DistroBaker 51ea11
-    __m128 vA, vB;
DistroBaker 51ea11
-    _mm_add_ps(vA, vB)
DistroBaker 51ea11
+    __m256 vA, vB, vC;
DistroBaker 51ea11
+    vC = _mm256_add_ps(vA, vB)
DistroBaker 51ea11
                             ]])],
DistroBaker 51ea11
                              [op_avx_support=1
DistroBaker 51ea11
                               MCA_BUILD_OP_AVX_FLAGS="-mavx"
DistroBaker 51ea11
diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
DistroBaker 51ea11
index 95a9c9ab84e..ef3f0932906 100644
DistroBaker 51ea11
--- a/ompi/mca/op/avx/op_avx_functions.c
DistroBaker 51ea11
+++ b/ompi/mca/op/avx/op_avx_functions.c
DistroBaker 51ea11
@@ -1,5 +1,5 @@
DistroBaker 51ea11
 /*
DistroBaker 51ea11
- * Copyright (c) 2019-2020 The University of Tennessee and The University
DistroBaker 51ea11
+ * Copyright (c) 2019-2021 The University of Tennessee and The University
DistroBaker 51ea11
  *                         of Tennessee Research Foundation.  All rights
DistroBaker 51ea11
  *                         reserved.
DistroBaker 51ea11
  * Copyright (c) 2020      Research Organization for Information Science
DistroBaker 51ea11
@@ -24,16 +24,42 @@
DistroBaker 51ea11
 #include "ompi/mca/op/avx/op_avx.h"
DistroBaker 51ea11
 
DistroBaker 51ea11
 #include <immintrin.h>
DistroBaker 51ea11
-
DistroBaker 51ea11
+/**
DistroBaker 51ea11
+ * The following logic is necessary to cope with distro maintainer's desire to change the compilation
DistroBaker 51ea11
+ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what
DistroBaker 51ea11
+ * code can be generated during make. If we detect that the current code generation architecture has
DistroBaker 51ea11
+ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back
DistroBaker 51ea11
+ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out).
DistroBaker 51ea11
+ */
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE)
DistroBaker 51ea11
-#define PREPEND _avx512
DistroBaker 51ea11
-#elif defined(GENERATE_AVX2_CODE)
DistroBaker 51ea11
-#define PREPEND _avx2
DistroBaker 51ea11
-#elif defined(GENERATE_AVX_CODE)
DistroBaker 51ea11
-#define PREPEND _avx
DistroBaker 51ea11
-#else
DistroBaker 51ea11
-#error This file should not be compiled in this conditions
DistroBaker 51ea11
-#endif
DistroBaker 51ea11
+#  if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__)
DistroBaker 51ea11
+#    define PREPEND _avx512
DistroBaker 51ea11
+#  else
DistroBaker 51ea11
+#    undef GENERATE_AVX512_CODE
DistroBaker 51ea11
+#  endif  /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */
DistroBaker 51ea11
+#endif  /* defined(GENERATE_AVX512_CODE) */
DistroBaker 51ea11
+
DistroBaker 51ea11
+#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE)
DistroBaker 51ea11
+#  if defined(__AVX2__)
DistroBaker 51ea11
+#    define PREPEND _avx2
DistroBaker 51ea11
+#  else
DistroBaker 51ea11
+#    undef GENERATE_AVX2_CODE
DistroBaker 51ea11
+#  endif  /* defined(__AVX2__) */
DistroBaker 51ea11
+#endif  /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */
DistroBaker 51ea11
+
DistroBaker 51ea11
+#if !defined(PREPEND) && defined(GENERATE_AVX_CODE)
DistroBaker 51ea11
+#  if defined(__AVX__)
DistroBaker 51ea11
+#    define PREPEND _avx
DistroBaker 51ea11
+#  endif
DistroBaker 51ea11
+#endif  /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */
DistroBaker 51ea11
+
DistroBaker 51ea11
+#if !defined(PREPEND)
DistroBaker 51ea11
+#  if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2
DistroBaker 51ea11
+#    error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step.
DistroBaker 51ea11
+#  else
DistroBaker 51ea11
+#    error This file should not be compiled in this conditions. Please provide the config.log file to the OMPI developers.
DistroBaker 51ea11
+#  endif  /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */
DistroBaker 51ea11
+#endif  /* !defined(PREPEND) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 /*
DistroBaker 51ea11
  * Concatenate preprocessor tokens A and B without expanding macro definitions
DistroBaker 51ea11
@@ -46,6 +72,102 @@
DistroBaker 51ea11
  */
DistroBaker 51ea11
 #define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
DistroBaker 51ea11
 
DistroBaker 51ea11
+/*
DistroBaker 51ea11
+ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq
DistroBaker 51ea11
+ *
DistroBaker 51ea11
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide
DistroBaker 51ea11
+ *
DistroBaker 51ea11
+ * _mm_add_epi[8,16,32,64]         SSE2
DistroBaker 51ea11
+ * _mm_add_pd                      SSE2
DistroBaker 51ea11
+ * _mm_add_ps                      SSE
DistroBaker 51ea11
+ * _mm_adds_epi[8,16]              SSE2
DistroBaker 51ea11
+ * _mm_adds_epu[8,16]              SSE2
DistroBaker 51ea11
+ * _mm_and_si128                   SSE2
DistroBaker 51ea11
+ * _mm_lddqu_si128                 SSE3
DistroBaker 51ea11
+ * _mm_loadu_pd                    SSE2
DistroBaker 51ea11
+ * _mm_loadu_ps                    SSE
DistroBaker 51ea11
+ * _mm_max_epi8                    SSE4.1
DistroBaker 51ea11
+ * _mm_max_epi16                   SSE2
DistroBaker 51ea11
+ * _mm_max_epi32                   SSE4.1
DistroBaker 51ea11
+ * _mm_max_epi64                   AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm_max_epu8                    SSE2
DistroBaker 51ea11
+ * _mm_max_epu[16,32]              SSE4.1
DistroBaker 51ea11
+ * _mm_max_epu64                   AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm_max_pd                      SSE2
DistroBaker 51ea11
+ * _mm_max_ps                      SSE
DistroBaker 51ea11
+ * _mm_min_epi8                    SSE4.1
DistroBaker 51ea11
+ * _mm_min_epi16                   SSE2
DistroBaker 51ea11
+ * _mm_min_epi32                   SSE4.1
DistroBaker 51ea11
+ * _mm_min_epi64                   AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm_min_epu8                    SSE2
DistroBaker 51ea11
+ * _mm_min_epu[16,32]              SSE4.1
DistroBaker 51ea11
+ * _mm_min_epu64                   AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm_min_pd                      SSE2
DistroBaker 51ea11
+ * _mm_min_ps                      SSE
DistroBaker 51ea11
+ * _mm_mul_pd                      SSE2
DistroBaker 51ea11
+ * _mm_mul_ps                      SSE
DistroBaker 51ea11
+ * _mm_mullo_epi16                 SSE2
DistroBaker 51ea11
+ * _mm_mullo_epi32                 SSE4.1
DistroBaker 51ea11
+ * _mm_mullo_epi64                 AVX512VL + AVX512DQ
DistroBaker 51ea11
+ * _mm_or_si128                    SSE2
DistroBaker 51ea11
+ * _mm_storeu_pd                   SSE2
DistroBaker 51ea11
+ * _mm_storeu_ps                   SSE
DistroBaker 51ea11
+ * _mm_storeu_si128                SSE2
DistroBaker 51ea11
+ * _mm_xor_si128                   SSE2
DistroBaker 51ea11
+ * _mm256_add_epi[8,16,32,64]      AVX2
DistroBaker 51ea11
+ * _mm256_add_p[s,d]               AVX
DistroBaker 51ea11
+ * _mm256_adds_epi[8,16]           AVX2
DistroBaker 51ea11
+ * _mm256_adds_epu[8,16]           AVX2
DistroBaker 51ea11
+ * _mm256_and_si256                AVX2
DistroBaker 51ea11
+ * _mm256_loadu_p[s,d]             AVX
DistroBaker 51ea11
+ * _mm256_loadu_si256              AVX
DistroBaker 51ea11
+ * _mm256_max_epi[8,16,32]         AVX2
DistroBaker 51ea11
+ * _mm256_max_epi64                AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm256_max_epu[8,16,32]         AVX2
DistroBaker 51ea11
+ * _mm256_max_epu64                AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm256_max_p[s,d]               AVX
DistroBaker 51ea11
+ * _mm256_min_epi[8,16,32]         AVX2
DistroBaker 51ea11
+ * _mm256_min_epi64                AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm256_min_epu[8,16,32]         AVX2
DistroBaker 51ea11
+ * _mm256_min_epu64                AVX512VL + AVX512F
DistroBaker 51ea11
+ * _mm256_min_p[s,d]               AVX
DistroBaker 51ea11
+ * _mm256_mul_p[s,d]               AVX
DistroBaker 51ea11
+ * _mm256_mullo_epi[16,32]         AVX2
DistroBaker 51ea11
+ * _mm256_mullo_epi64              AVX512VL + AVX512DQ
DistroBaker 51ea11
+ * _mm256_or_si256                 AVX2
DistroBaker 51ea11
+ * _mm256_storeu_p[s,d]            AVX
DistroBaker 51ea11
+ * _mm256_storeu_si256             AVX
DistroBaker 51ea11
+ * _mm256_xor_si256                AVX2
DistroBaker 51ea11
+ * _mm512_add_epi[8,16]            AVX512BW
DistroBaker 51ea11
+ * _mm512_add_epi[32,64]           AVX512F
DistroBaker 51ea11
+ * _mm512_add_p[s,d]               AVX512F
DistroBaker 51ea11
+ * _mm512_adds_epi[8,16]           AVX512BW
DistroBaker 51ea11
+ * _mm512_adds_epu[8,16]           AVX512BW
DistroBaker 51ea11
+ * _mm512_and_si512                AVX512F
DistroBaker 51ea11
+ * _mm512_cvtepi16_epi8            AVX512BW
DistroBaker 51ea11
+ * _mm512_cvtepi8_epi16            AVX512BW
DistroBaker 51ea11
+ * _mm512_loadu_p[s,d]             AVX512F
DistroBaker 51ea11
+ * _mm512_loadu_si512              AVX512F
DistroBaker 51ea11
+ * _mm512_max_epi[8,16]            AVX512BW
DistroBaker 51ea11
+ * _mm512_max_epi[32,64]           AVX512F
DistroBaker 51ea11
+ * _mm512_max_epu[8,16]            AVX512BW
DistroBaker 51ea11
+ * _mm512_max_epu[32,64]           AVX512F
DistroBaker 51ea11
+ * _mm512_max_p[s,d]               AVX512F
DistroBaker 51ea11
+ * _mm512_min_epi[8,16]            AVX512BW
DistroBaker 51ea11
+ * _mm512_min_epi[32,64]           AVX512F
DistroBaker 51ea11
+ * _mm512_min_epu[8,16]            AVX512BW
DistroBaker 51ea11
+ * _mm512_min_epu[32,64]           AVX512F
DistroBaker 51ea11
+ * _mm512_min_p[s,d]               AVX512F
DistroBaker 51ea11
+ * _mm512_mul_p[s,d]               AVX512F
DistroBaker 51ea11
+ * _mm512_mullo_epi16              AVX512BW
DistroBaker 51ea11
+ * _mm512_mullo_epi32              AVX512F
DistroBaker 51ea11
+ * _mm512_mullo_epi64              AVX512DQ
DistroBaker 51ea11
+ * _mm512_or_si512                 AVX512F
DistroBaker 51ea11
+ * _mm512_storeu_p[s,d]            AVX512F
DistroBaker 51ea11
+ * _mm512_storeu_si512             AVX512F
DistroBaker 51ea11
+ * _mm512_xor_si512                AVX512F
DistroBaker 51ea11
+ */
DistroBaker 51ea11
+
DistroBaker 51ea11
 /*
DistroBaker 51ea11
  * Since all the functions in this file are essentially identical, we
DistroBaker 51ea11
  * use a macro to substitute in names and types.  The core operation
DistroBaker 51ea11
@@ -62,13 +184,14 @@
DistroBaker 51ea11
   (((_flag) & mca_op_avx_component.flags) == (_flag))
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op)               \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
DistroBaker 51ea11
         int types_per_step = (512 / 8) / sizeof(type);                         \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
DistroBaker 51ea11
-            __m512i vecA =  _mm512_loadu_si512((__m512*)in);                   \
DistroBaker 51ea11
+            __m512i vecA = _mm512_loadu_si512((__m512*)in);                    \
DistroBaker 51ea11
             in += types_per_step;                                              \
DistroBaker 51ea11
-            __m512i vecB =  _mm512_loadu_si512((__m512*)out);                  \
DistroBaker 51ea11
+            __m512i vecB = _mm512_loadu_si512((__m512*)out);                   \
DistroBaker 51ea11
             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB);  \
DistroBaker 51ea11
             _mm512_storeu_si512((__m512*)out, res);                            \
DistroBaker 51ea11
             out += types_per_step;                                             \
DistroBaker 51ea11
@@ -76,10 +199,14 @@
DistroBaker 51ea11
         if( 0 == left_over ) return;                                           \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op)                 \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) {  \
DistroBaker 51ea11
         int types_per_step = (256 / 8) / sizeof(type);  /* AVX2 */             \
DistroBaker 51ea11
@@ -87,30 +214,37 @@
DistroBaker 51ea11
             __m256i vecA = _mm256_loadu_si256((__m256i*)in);                   \
DistroBaker 51ea11
             in += types_per_step;                                              \
DistroBaker 51ea11
             __m256i vecB = _mm256_loadu_si256((__m256i*)out);                  \
DistroBaker 51ea11
-            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
+            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB);  \
DistroBaker 51ea11
             _mm256_storeu_si256((__m256i*)out, res);                           \
DistroBaker 51ea11
             out += types_per_step;                                             \
DistroBaker 51ea11
         }                                                                      \
DistroBaker 51ea11
         if( 0 == left_over ) return;                                           \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE3__
DistroBaker 51ea11
 #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op)               \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
DistroBaker 51ea11
-        int types_per_step = (128 / 8) / sizeof(type);  /* AVX */              \
DistroBaker 51ea11
+        int types_per_step = (128 / 8) / sizeof(type);                         \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) {    \
DistroBaker 51ea11
             __m128i vecA = _mm_lddqu_si128((__m128i*)in);                      \
DistroBaker 51ea11
             in += types_per_step;                                              \
DistroBaker 51ea11
             __m128i vecB = _mm_lddqu_si128((__m128i*)out);                     \
DistroBaker 51ea11
-            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB);    \
DistroBaker 51ea11
+            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB);     \
DistroBaker 51ea11
             _mm_storeu_si128((__m128i*)out, res);                              \
DistroBaker 51ea11
             out += types_per_step;                                             \
DistroBaker 51ea11
         }                                                                      \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
DistroBaker 51ea11
+#endif  /* __SSE3__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512BW__ && __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op)         \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {  \
DistroBaker 51ea11
         int types_per_step = (256 / 8) / sizeof(type);                  \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in);       \
DistroBaker 51ea11
-            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)out);      \
DistroBaker 51ea11
+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in);        \
DistroBaker 51ea11
+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out);       \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
DistroBaker 51ea11
             __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp);              \
DistroBaker 51ea11
@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
DistroBaker 51ea11
+#endif  /* __AVX512BW__ && __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 /**
DistroBaker 51ea11
@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
DistroBaker 51ea11
  *
DistroBaker 51ea11
  */
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op)               \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {        \
DistroBaker 51ea11
         types_per_step = (512 / 8) / sizeof(type);                      \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512i vecA =  _mm512_loadu_si512((__m512i*)in);           \
DistroBaker 51ea11
+            __m512i vecA = _mm512_loadu_si512((__m512i*)in);            \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
-            __m512i vecB =  _mm512_loadu_si512((__m512i*)out);          \
DistroBaker 51ea11
+            __m512i vecB = _mm512_loadu_si512((__m512i*)out);           \
DistroBaker 51ea11
             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
DistroBaker 51ea11
             _mm512_storeu_si512((__m512i*)out, res);                    \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op)                 \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
DistroBaker 51ea11
         types_per_step = (256 / 8) / sizeof(type);                      \
DistroBaker 51ea11
@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
DistroBaker 51ea11
             __m256i vecA = _mm256_loadu_si256((__m256i*)in);            \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
             __m256i vecB = _mm256_loadu_si256((__m256i*)out);           \
DistroBaker 51ea11
-            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
DistroBaker 51ea11
+            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
DistroBaker 51ea11
             _mm256_storeu_si256((__m256i*)out, res);                    \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE3__ && __SSE2__
DistroBaker 51ea11
 #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op)                 \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
DistroBaker 51ea11
         types_per_step = (128 / 8) / sizeof(type);                      \
DistroBaker 51ea11
@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
DistroBaker 51ea11
             __m128i vecA = _mm_lddqu_si128((__m128i*)in);               \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
             __m128i vecB = _mm_lddqu_si128((__m128i*)out);              \
DistroBaker 51ea11
-            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
DistroBaker 51ea11
+            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
DistroBaker 51ea11
             _mm_storeu_si128((__m128i*)out, res);                       \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
DistroBaker 51ea11
+#endif  /* __SSE3__ && __SSE2__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_FLOAT_FUNC(op)                                    \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
DistroBaker 51ea11
         types_per_step = (512 / 8) / sizeof(float);                     \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512 vecA =  _mm512_loadu_ps((__m512*)in);                \
DistroBaker 51ea11
-            __m512 vecB =  _mm512_loadu_ps((__m512*)out);         \
DistroBaker 51ea11
+            __m512 vecA = _mm512_loadu_ps((__m512*)in);                 \
DistroBaker 51ea11
+            __m512 vecB = _mm512_loadu_ps((__m512*)out);                \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
DistroBaker 51ea11
             _mm512_storeu_ps((__m512*)out, res);                        \
DistroBaker 51ea11
@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_FLOAT_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX_FLOAT_FUNC(op)                                       \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (256 / 8) / sizeof(float);                     \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
DistroBaker 51ea11
-            __m256 vecA =  _mm256_loadu_ps(in);                          \
DistroBaker 51ea11
+            __m256 vecA = _mm256_loadu_ps(in);                          \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
-            __m256 vecB =  _mm256_loadu_ps(out);                         \
DistroBaker 51ea11
+            __m256 vecB = _mm256_loadu_ps(out);                         \
DistroBaker 51ea11
             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
DistroBaker 51ea11
-            _mm256_storeu_ps(out, res);                                  \
DistroBaker 51ea11
+            _mm256_storeu_ps(out, res);                                 \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX_FLOAT_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE__
DistroBaker 51ea11
 #define OP_AVX_SSE_FLOAT_FUNC(op)                                       \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (128 / 8) / sizeof(float);                     \
DistroBaker 51ea11
@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
DistroBaker 51ea11
+#endif  /* __SSE__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE_FLOAT_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_DOUBLE_FUNC(op)                                   \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
DistroBaker 51ea11
         types_per_step = (512 / 8)  / sizeof(double);                   \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512d vecA =  _mm512_loadu_pd(in);                        \
DistroBaker 51ea11
+            __m512d vecA = _mm512_loadu_pd(in);                         \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
-            __m512d vecB =  _mm512_loadu_pd(out);                       \
DistroBaker 51ea11
+            __m512d vecB = _mm512_loadu_pd(out);                        \
DistroBaker 51ea11
             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
DistroBaker 51ea11
             _mm512_storeu_pd((out), res);                               \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd
DistroBaker 51ea11
+#endif  /* __AVXF512__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX_DOUBLE_FUNC(op)                                      \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (256 / 8)  / sizeof(double);                   \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
DistroBaker 51ea11
-            __m256d vecA =  _mm256_loadu_pd(in);                        \
DistroBaker 51ea11
+            __m256d vecA = _mm256_loadu_pd(in);                         \
DistroBaker 51ea11
             in += types_per_step;                                       \
DistroBaker 51ea11
-            __m256d vecB =  _mm256_loadu_pd(out);                       \
DistroBaker 51ea11
+            __m256d vecB = _mm256_loadu_pd(out);                        \
DistroBaker 51ea11
             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
DistroBaker 51ea11
             _mm256_storeu_pd(out, res);                                 \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
       }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX_DOUBLE_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE2__
DistroBaker 51ea11
 #define OP_AVX_SSE2_DOUBLE_FUNC(op)                                     \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
DistroBaker 51ea11
         types_per_step = (128 / 8)  / sizeof(double);                   \
DistroBaker 51ea11
@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
DistroBaker 51ea11
+#endif  /* __SSE2__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
DistroBaker 51ea11
  *  routines, needed for some optimizations.
DistroBaker 51ea11
  */
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op)      \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) {   \
DistroBaker 51ea11
         int types_per_step = (512 / 8) / sizeof(type);                  \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512i vecA =  _mm512_loadu_si512(in1);                    \
DistroBaker 51ea11
-            __m512i vecB =  _mm512_loadu_si512(in2);                    \
DistroBaker 51ea11
+            __m512i vecA = _mm512_loadu_si512(in1);                     \
DistroBaker 51ea11
+            __m512i vecB = _mm512_loadu_si512(in2);                     \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op)        \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
DistroBaker 51ea11
         int types_per_step = (256 / 8) / sizeof(type);                  \
DistroBaker 51ea11
@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
DistroBaker 51ea11
             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
-            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
+            __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
             _mm256_storeu_si256((__m256i*)out, res);                    \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE3__ && __SSE2__
DistroBaker 51ea11
 #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op)      \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) {       \
DistroBaker 51ea11
         int types_per_step = (128 / 8) / sizeof(type);                  \
DistroBaker 51ea11
@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
DistroBaker 51ea11
             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
-            __m128i res =  _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
+            __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
DistroBaker 51ea11
             _mm_storeu_si128((__m128i*)out, res);                       \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
DistroBaker 51ea11
+#endif  /* __SSE3__ && __SSE2__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512BW__ && __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op)       \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
DistroBaker 51ea11
         int types_per_step = (256 / 8) / sizeof(type);                  \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m256i vecA_tmp =  _mm256_loadu_si256((__m256i*)in1);      \
DistroBaker 51ea11
-            __m256i vecB_tmp =  _mm256_loadu_si256((__m256i*)in2);      \
DistroBaker 51ea11
+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1);       \
DistroBaker 51ea11
+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2);       \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp);              \
DistroBaker 51ea11
@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
   }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
DistroBaker 51ea11
+#endif  /* __AVX512BW__ && __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 /**
DistroBaker 51ea11
@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op)             \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
DistroBaker 51ea11
         types_per_step = (512 / 8) / sizeof(type);                      \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) {  \
DistroBaker 51ea11
-            __m512i vecA =  _mm512_loadu_si512(in1);                    \
DistroBaker 51ea11
-            __m512i vecB =  _mm512_loadu_si512(in2);                    \
DistroBaker 51ea11
+            __m512i vecA = _mm512_loadu_si512(in1);                     \
DistroBaker 51ea11
+            __m512i vecB = _mm512_loadu_si512(in2);                     \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m512i res = _mm512_##op##_si512(vecA, vecB);              \
DistroBaker 51ea11
@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op)               \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
DistroBaker 51ea11
         types_per_step = (256 / 8) / sizeof(type);                      \
DistroBaker 51ea11
@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
             __m256i vecB = _mm256_loadu_si256((__m256i*)in2);           \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
-            __m256i res =  _mm256_##op##_si256(vecA, vecB);             \
DistroBaker 51ea11
+            __m256i res = _mm256_##op##_si256(vecA, vecB);              \
DistroBaker 51ea11
             _mm256_storeu_si256((__m256i*)out, res);                    \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE3__ && __SSE2__
DistroBaker 51ea11
 #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op)               \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) {            \
DistroBaker 51ea11
         types_per_step = (128 / 8) / sizeof(type);                      \
DistroBaker 51ea11
@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
DistroBaker 51ea11
             __m128i vecB = _mm_lddqu_si128((__m128i*)in2);              \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
-            __m128i res =  _mm_##op##_si128(vecA, vecB);                \
DistroBaker 51ea11
+            __m128i res = _mm_##op##_si128(vecA, vecB);                 \
DistroBaker 51ea11
             _mm_storeu_si128((__m128i*)out, res);                       \
DistroBaker 51ea11
             out += types_per_step;                                      \
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
DistroBaker 51ea11
+#endif  /* __SSE3__ && __SSE2__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_FLOAT_FUNC_3(op)                                  \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
DistroBaker 51ea11
         types_per_step = (512 / 8) / sizeof(float);                     \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512 vecA =  _mm512_loadu_ps(in1);                        \
DistroBaker 51ea11
-            __m512 vecB =  _mm512_loadu_ps(in2);                        \
DistroBaker 51ea11
+            __m512 vecA = _mm512_loadu_ps(in1);                         \
DistroBaker 51ea11
+            __m512 vecB = _mm512_loadu_ps(in2);                         \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m512 res = _mm512_##op##_ps(vecA, vecB);                  \
DistroBaker 51ea11
@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
DistroBaker 51ea11
+#endif  /* __AVX512F__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX_FLOAT_FUNC_3(op)                                     \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (256 / 8) / sizeof(float);                     \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
DistroBaker 51ea11
-            __m256 vecA =  _mm256_loadu_ps(in1);                        \
DistroBaker 51ea11
-            __m256 vecB =  _mm256_loadu_ps(in2);                        \
DistroBaker 51ea11
+            __m256 vecA = _mm256_loadu_ps(in1);                         \
DistroBaker 51ea11
+            __m256 vecB = _mm256_loadu_ps(in2);                         \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m256 res = _mm256_##op##_ps(vecA, vecB);                  \
DistroBaker 51ea11
@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE__
DistroBaker 51ea11
 #define OP_AVX_SSE_FLOAT_FUNC_3(op)                  \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (128 / 8) / sizeof(float);                     \
DistroBaker 51ea11
@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
DistroBaker 51ea11
+#endif  /* __SSE__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
DistroBaker 51ea11
+#if __AVX512F__
DistroBaker 51ea11
 #define OP_AVX_AVX512_DOUBLE_FUNC_3(op)                                 \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {         \
DistroBaker 51ea11
         types_per_step = (512 / 8) / sizeof(double);                    \
DistroBaker 51ea11
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
DistroBaker 51ea11
-            __m512d vecA =  _mm512_loadu_pd((in1));                     \
DistroBaker 51ea11
-            __m512d vecB =  _mm512_loadu_pd((in2));                     \
DistroBaker 51ea11
+            __m512d vecA = _mm512_loadu_pd((in1));                      \
DistroBaker 51ea11
+            __m512d vecB = _mm512_loadu_pd((in2));                      \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m512d res = _mm512_##op##_pd(vecA, vecB);                 \
DistroBaker 51ea11
@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd
DistroBaker 51ea11
+#endif  /* __AVXF512__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
DistroBaker 51ea11
+#if __AVX__
DistroBaker 51ea11
 #define OP_AVX_AVX_DOUBLE_FUNC_3(op)                                    \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) {             \
DistroBaker 51ea11
         types_per_step = (256 / 8) / sizeof(double);                    \
DistroBaker 51ea11
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
DistroBaker 51ea11
-            __m256d vecA =  _mm256_loadu_pd(in1);                       \
DistroBaker 51ea11
-            __m256d vecB =  _mm256_loadu_pd(in2);                       \
DistroBaker 51ea11
+            __m256d vecA = _mm256_loadu_pd(in1);                        \
DistroBaker 51ea11
+            __m256d vecB = _mm256_loadu_pd(in2);                        \
DistroBaker 51ea11
             in1 += types_per_step;                                      \
DistroBaker 51ea11
             in2 += types_per_step;                                      \
DistroBaker 51ea11
             __m256d res = _mm256_##op##_pd(vecA, vecB);                 \
DistroBaker 51ea11
@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
DistroBaker 51ea11
         if( 0 == left_over ) return;                                    \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
DistroBaker 51ea11
+#endif  /* __AVX__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
DistroBaker 51ea11
 
DistroBaker 51ea11
 #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
DistroBaker 51ea11
+#if __SSE2__
DistroBaker 51ea11
 #define OP_AVX_SSE2_DOUBLE_FUNC_3(op)                                   \
DistroBaker 51ea11
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) {            \
DistroBaker 51ea11
         types_per_step = (128 / 8) / sizeof(double);                    \
DistroBaker 51ea11
@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
DistroBaker 51ea11
         }                                                               \
DistroBaker 51ea11
     }
DistroBaker 51ea11
 #else
DistroBaker 51ea11
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
DistroBaker 51ea11
+#endif  /* __SSE2__ */
DistroBaker 51ea11
+#else
DistroBaker 51ea11
 #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
DistroBaker 51ea11
 #endif  /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
DistroBaker 51ea11
 
DistroBaker 51ea11
DistroBaker 51ea11
From 20be3fc25713ac2de3eb4d77b85248d7fe2bc28b Mon Sep 17 00:00:00 2001
DistroBaker 51ea11
From: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
Date: Tue, 5 Jan 2021 22:40:26 -0500
DistroBaker 51ea11
Subject: [PATCH 3/3] A better test for MPI_OP performance.
DistroBaker 51ea11
DistroBaker 51ea11
The test now has the ability to add a shift to all or to any of the
DistroBaker 51ea11
input and output buffers to assess the impact of unaligned operations.
DistroBaker 51ea11
DistroBaker 51ea11
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
DistroBaker 51ea11
---
DistroBaker 51ea11
 test/datatype/reduce_local.c | 161 ++++++++++++++++++++++-------------
DistroBaker 51ea11
 1 file changed, 104 insertions(+), 57 deletions(-)
DistroBaker 51ea11
DistroBaker 51ea11
diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c
DistroBaker 51ea11
index 97890f94227..f227439b714 100644
DistroBaker 51ea11
--- a/test/datatype/reduce_local.c
DistroBaker 51ea11
+++ b/test/datatype/reduce_local.c
DistroBaker 51ea11
@@ -59,7 +59,7 @@ static int total_errors = 0;
DistroBaker 51ea11
      _a < _b ? _a : _b; })
DistroBaker 51ea11
 
DistroBaker 51ea11
 static void print_status(char* op, char* type, int type_size,
DistroBaker 51ea11
-                         int count, double duration,
DistroBaker 51ea11
+                         int count, int max_shift, double *duration, int repeats,
DistroBaker 51ea11
                          int correct )
DistroBaker 51ea11
 {
DistroBaker 51ea11
     if(correct) {
DistroBaker 51ea11
@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size,
DistroBaker 51ea11
         printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
DistroBaker 51ea11
         total_errors++;
DistroBaker 51ea11
     }
DistroBaker 51ea11
-    printf(" count  %-10d  time %.6f seconds\n", count, duration);
DistroBaker 51ea11
+    if( 1 == max_shift ) {
DistroBaker 51ea11
+        printf(" count  %-10d  time (seconds) %.8f seconds\n", count, duration[0] / repeats);
DistroBaker 51ea11
+    } else {
DistroBaker 51ea11
+        printf(" count  %-10d  time (seconds / shifts) ", count);
DistroBaker 51ea11
+        for( int i = 0; i < max_shift; i++ ) {
DistroBaker 51ea11
+            printf("%.8f ", duration[i] / repeats );
DistroBaker 51ea11
+        }
DistroBaker 51ea11
+        printf("\n");
DistroBaker 51ea11
+    }
DistroBaker 51ea11
 }
DistroBaker 51ea11
 
DistroBaker 51ea11
 static int do_ops_built = 0;
DistroBaker 51ea11
@@ -115,19 +123,23 @@ do { \
DistroBaker 51ea11
     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
DistroBaker 51ea11
     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
DistroBaker 51ea11
     skip_op_type = 0; \
DistroBaker 51ea11
-    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
DistroBaker 51ea11
-        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
DistroBaker 51ea11
-        tstart = MPI_Wtime(); \
DistroBaker 51ea11
-        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
DistroBaker 51ea11
-        tend = MPI_Wtime(); \
DistroBaker 51ea11
-        if( check ) { \
DistroBaker 51ea11
-            for( i = 0; i < (COUNT)-_k; i++ ) { \
DistroBaker 51ea11
-                if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
DistroBaker 51ea11
-                    continue; \
DistroBaker 51ea11
-                printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
DistroBaker 51ea11
-                       _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
DistroBaker 51ea11
-                correctness = 0; \
DistroBaker 51ea11
-                break; \
DistroBaker 51ea11
+    for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
DistroBaker 51ea11
+        duration[_k] = 0.0; \
DistroBaker 51ea11
+        for(int _r = repeats; _r > 0; _r--) { \
DistroBaker 51ea11
+            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
DistroBaker 51ea11
+            tstart = MPI_Wtime(); \
DistroBaker 51ea11
+            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
DistroBaker 51ea11
+            tend = MPI_Wtime(); \
DistroBaker 51ea11
+            duration[_k] += (tend - tstart); \
DistroBaker 51ea11
+            if( check ) { \
DistroBaker 51ea11
+                for( i = 0; i < (COUNT)-_k; i++ ) { \
DistroBaker 51ea11
+                    if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
DistroBaker 51ea11
+                        continue; \
DistroBaker 51ea11
+                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
DistroBaker 51ea11
+                           _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
DistroBaker 51ea11
+                    correctness = 0; \
DistroBaker 51ea11
+                    break; \
DistroBaker 51ea11
+                } \
DistroBaker 51ea11
             } \
DistroBaker 51ea11
         } \
DistroBaker 51ea11
     } \
DistroBaker 51ea11
@@ -139,20 +151,24 @@ do { \
DistroBaker 51ea11
     const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
DistroBaker 51ea11
     TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
DistroBaker 51ea11
     skip_op_type = 0; \
DistroBaker 51ea11
-    for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
DistroBaker 51ea11
-        memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
DistroBaker 51ea11
-        tstart = MPI_Wtime(); \
DistroBaker 51ea11
-        MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
DistroBaker 51ea11
-        tend = MPI_Wtime(); \
DistroBaker 51ea11
-        if( check ) { \
DistroBaker 51ea11
-            for( i = 0; i < (COUNT); i++ ) { \
DistroBaker 51ea11
-                TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
DistroBaker 51ea11
-                if(_v2 == OPNAME(_v1, _v3)) \
DistroBaker 51ea11
-                    continue; \
DistroBaker 51ea11
-                printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
DistroBaker 51ea11
-                       _k, i, _v1, (#OPNAME), _v3, _v2); \
DistroBaker 51ea11
-                correctness = 0; \
DistroBaker 51ea11
-                break; \
DistroBaker 51ea11
+    for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
DistroBaker 51ea11
+        duration[_k] = 0.0; \
DistroBaker 51ea11
+        for(int _r = repeats; _r > 0; _r--) { \
DistroBaker 51ea11
+            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
DistroBaker 51ea11
+            tstart = MPI_Wtime(); \
DistroBaker 51ea11
+            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
DistroBaker 51ea11
+            tend = MPI_Wtime(); \
DistroBaker 51ea11
+            duration[_k] += (tend - tstart); \
DistroBaker 51ea11
+            if( check ) { \
DistroBaker 51ea11
+                for( i = 0; i < (COUNT); i++ ) { \
DistroBaker 51ea11
+                    TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
DistroBaker 51ea11
+                    if(_v2 == OPNAME(_v1, _v3)) \
DistroBaker 51ea11
+                        continue; \
DistroBaker 51ea11
+                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " !=  %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
DistroBaker 51ea11
+                           _k, i, _v1, (#OPNAME), _v3, _v2); \
DistroBaker 51ea11
+                    correctness = 0; \
DistroBaker 51ea11
+                    break; \
DistroBaker 51ea11
+                } \
DistroBaker 51ea11
             } \
DistroBaker 51ea11
         } \
DistroBaker 51ea11
     } \
DistroBaker 51ea11
@@ -163,24 +179,36 @@ int main(int argc, char **argv)
DistroBaker 51ea11
 {
DistroBaker 51ea11
     static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
DistroBaker 51ea11
     int count, type_size = 8, rank, size, provided, correctness = 1;
DistroBaker 51ea11
-    int repeats = 1, i, c;
DistroBaker 51ea11
-    double tstart, tend;
DistroBaker 51ea11
+    int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0;
DistroBaker 51ea11
+    int max_shift = 4;
DistroBaker 51ea11
+    double *duration, tstart, tend;
DistroBaker 51ea11
     bool check = true;
DistroBaker 51ea11
     char type[5] = "uifd", *op = "sum", *mpi_type;
DistroBaker 51ea11
     int lower = 1, upper = 1000000, skip_op_type;
DistroBaker 51ea11
     MPI_Op mpi_op;
DistroBaker 51ea11
 
DistroBaker 51ea11
-    while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) {
DistroBaker 51ea11
+    while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) {
DistroBaker 51ea11
         switch(c) {
DistroBaker 51ea11
         case 'l':
DistroBaker 51ea11
             lower = atoi(optarg);
DistroBaker 51ea11
             if( lower <= 0 ) {
DistroBaker 51ea11
-                fprintf(stderr, "The number of elements must be positive\n");
DistroBaker 51ea11
+                fprintf(stderr, "The lower number of elements must be positive\n");
DistroBaker 51ea11
                 exit(-1);
DistroBaker 51ea11
             }
DistroBaker 51ea11
             break;
DistroBaker 51ea11
         case 'u':
DistroBaker 51ea11
             upper = atoi(optarg);
DistroBaker 51ea11
+            if( lower <= 0 ) {
DistroBaker 51ea11
+                fprintf(stderr, "The upper number of elements must be positive\n");
DistroBaker 51ea11
+                exit(-1);
DistroBaker 51ea11
+            }
DistroBaker 51ea11
+            break;
DistroBaker 51ea11
+        case 'i':
DistroBaker 51ea11
+            max_shift = atoi(optarg);
DistroBaker 51ea11
+            if( max_shift <= 0 ) {
DistroBaker 51ea11
+                fprintf(stderr, "The max shift must be positive\n");
DistroBaker 51ea11
+                exit(-1);
DistroBaker 51ea11
+            }
DistroBaker 51ea11
             break;
DistroBaker 51ea11
         case 'f':
DistroBaker 51ea11
             check = false;
DistroBaker 51ea11
@@ -216,14 +244,32 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                 exit(-1);
DistroBaker 51ea11
             }
DistroBaker 51ea11
             break;
DistroBaker 51ea11
+        case '1':
DistroBaker 51ea11
+            op1_alignment = atoi(optarg);
DistroBaker 51ea11
+            if( op1_alignment < 0 ) {
DistroBaker 51ea11
+                fprintf(stderr, "alignment for the first operand must be positive\n");
DistroBaker 51ea11
+                exit(-1);
DistroBaker 51ea11
+            }
DistroBaker 51ea11
+            break;
DistroBaker 51ea11
+        case '2':
DistroBaker 51ea11
+            res_alignment = atoi(optarg);
DistroBaker 51ea11
+            if( res_alignment < 0 ) {
DistroBaker 51ea11
+                fprintf(stderr, "alignment for the result must be positive\n");
DistroBaker 51ea11
+                exit(-1);
DistroBaker 51ea11
+            }
DistroBaker 51ea11
+            break;
DistroBaker 51ea11
         case 'h':
DistroBaker 51ea11
             fprintf(stdout, "%s options are:\n"
DistroBaker 51ea11
                     " -l <number> : lower number of elements\n"
DistroBaker 51ea11
                     " -u <number> : upper number of elements\n"
DistroBaker 51ea11
                     " -s <type_size> : 8, 16, 32 or 64 bits elements\n"
DistroBaker 51ea11
                     " -t [i,u,f,d] : type of the elements to apply the operations on\n"
DistroBaker 51ea11
+                    " -r <number> : number of repetitions for each test\n"
DistroBaker 51ea11
                     " -o <op> : comma separated list of operations to execute among\n"
DistroBaker 51ea11
                     "           sum, min, max, prod, bor, bxor, band\n"
DistroBaker 51ea11
+                    " -i <number> : shift on all buffers to check alignment\n"
DistroBaker 51ea11
+                    " -1 <number> : (mis)alignment in elements for the first op\n"
DistroBaker 51ea11
+                    " -2 <number> : (mis)alignment in elements for the result\n"
DistroBaker 51ea11
                     " -v: increase the verbosity level\n"
DistroBaker 51ea11
                     " -h: this help message\n", argv[0]);
DistroBaker 51ea11
             exit(0);
DistroBaker 51ea11
@@ -233,9 +279,10 @@ int main(int argc, char **argv)
DistroBaker 51ea11
     if( !do_ops_built ) {  /* not yet done, take the default */
DistroBaker 51ea11
             build_do_ops( "all", do_ops);
DistroBaker 51ea11
     }
DistroBaker 51ea11
-    in_buf          = malloc(upper * sizeof(double));
DistroBaker 51ea11
-    inout_buf       = malloc(upper * sizeof(double));
DistroBaker 51ea11
-    inout_check_buf = malloc(upper * sizeof(double));
DistroBaker 51ea11
+    posix_memalign( &in_buf,          64, (upper + op1_alignment) * sizeof(double));
DistroBaker 51ea11
+    posix_memalign( &inout_buf,       64, (upper + res_alignment) * sizeof(double));
DistroBaker 51ea11
+    posix_memalign( &inout_check_buf, 64, upper * sizeof(double));
DistroBaker 51ea11
+    duration = (double*)malloc(max_shift * sizeof(double));
DistroBaker 51ea11
 
DistroBaker 51ea11
     ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
DistroBaker 51ea11
 
DistroBaker 51ea11
@@ -253,8 +300,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                 correctness = 1;
DistroBaker 51ea11
                 if('i' == type[type_idx]) {
DistroBaker 51ea11
                     if( 8 == type_size ) {
DistroBaker 51ea11
-                        int8_t *in_int8 = (int8_t*)in_buf,
DistroBaker 51ea11
-                            *inout_int8 = (int8_t*)inout_buf,
DistroBaker 51ea11
+                        int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)),
DistroBaker 51ea11
+                            *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)),
DistroBaker 51ea11
                             *inout_int8_for_check = (int8_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_int8[i] = 5;
DistroBaker 51ea11
@@ -299,8 +346,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 16 == type_size ) {
DistroBaker 51ea11
-                        int16_t *in_int16 = (int16_t*)in_buf,
DistroBaker 51ea11
-                            *inout_int16 = (int16_t*)inout_buf,
DistroBaker 51ea11
+                        int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)),
DistroBaker 51ea11
+                            *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)),
DistroBaker 51ea11
                             *inout_int16_for_check = (int16_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_int16[i] = 5;
DistroBaker 51ea11
@@ -345,8 +392,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 32 == type_size ) {
DistroBaker 51ea11
-                        int32_t *in_int32 = (int32_t*)in_buf,
DistroBaker 51ea11
-                            *inout_int32 = (int32_t*)inout_buf,
DistroBaker 51ea11
+                        int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)),
DistroBaker 51ea11
+                            *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)),
DistroBaker 51ea11
                             *inout_int32_for_check = (int32_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_int32[i] = 5;
DistroBaker 51ea11
@@ -391,8 +438,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 64 == type_size ) {
DistroBaker 51ea11
-                        int64_t *in_int64 = (int64_t*)in_buf,
DistroBaker 51ea11
-                            *inout_int64 = (int64_t*)inout_buf,
DistroBaker 51ea11
+                        int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)),
DistroBaker 51ea11
+                            *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)),
DistroBaker 51ea11
                             *inout_int64_for_check = (int64_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_int64[i] = 5;
DistroBaker 51ea11
@@ -440,8 +487,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
 
DistroBaker 51ea11
                 if( 'u' == type[type_idx] ) {
DistroBaker 51ea11
                     if( 8 == type_size ) {
DistroBaker 51ea11
-                        uint8_t *in_uint8 = (uint8_t*)in_buf,
DistroBaker 51ea11
-                            *inout_uint8 = (uint8_t*)inout_buf,
DistroBaker 51ea11
+                        uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)),
DistroBaker 51ea11
+                            *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)),
DistroBaker 51ea11
                             *inout_uint8_for_check = (uint8_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_uint8[i] = 5;
DistroBaker 51ea11
@@ -486,8 +533,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 16 == type_size ) {
DistroBaker 51ea11
-                        uint16_t *in_uint16 = (uint16_t*)in_buf,
DistroBaker 51ea11
-                            *inout_uint16 = (uint16_t*)inout_buf,
DistroBaker 51ea11
+                        uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)),
DistroBaker 51ea11
+                            *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)),
DistroBaker 51ea11
                             *inout_uint16_for_check = (uint16_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_uint16[i] = 5;
DistroBaker 51ea11
@@ -532,8 +579,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 32 == type_size ) {
DistroBaker 51ea11
-                        uint32_t *in_uint32 = (uint32_t*)in_buf,
DistroBaker 51ea11
-                            *inout_uint32 = (uint32_t*)inout_buf,
DistroBaker 51ea11
+                        uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)),
DistroBaker 51ea11
+                            *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)),
DistroBaker 51ea11
                             *inout_uint32_for_check = (uint32_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_uint32[i] = 5;
DistroBaker 51ea11
@@ -578,8 +625,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                         }
DistroBaker 51ea11
                     }
DistroBaker 51ea11
                     if( 64 == type_size ) {
DistroBaker 51ea11
-                        uint64_t *in_uint64 = (uint64_t*)in_buf,
DistroBaker 51ea11
-                              *inout_uint64 = (uint64_t*)inout_buf,
DistroBaker 51ea11
+                        uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)),
DistroBaker 51ea11
+                              *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)),
DistroBaker 51ea11
                             *inout_uint64_for_check = (uint64_t*)inout_check_buf;
DistroBaker 51ea11
                         for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                             in_uint64[i] = 5;
DistroBaker 51ea11
@@ -626,8 +673,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                 }
DistroBaker 51ea11
 
DistroBaker 51ea11
                 if( 'f' == type[type_idx] ) {
DistroBaker 51ea11
-                    float *in_float = (float*)in_buf,
DistroBaker 51ea11
-                        *inout_float = (float*)inout_buf,
DistroBaker 51ea11
+                    float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)),
DistroBaker 51ea11
+                        *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)),
DistroBaker 51ea11
                         *inout_float_for_check = (float*)inout_check_buf;
DistroBaker 51ea11
                     for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                         in_float[i] = 1000.0+1;
DistroBaker 51ea11
@@ -658,8 +705,8 @@ int main(int argc, char **argv)
DistroBaker 51ea11
                 }
DistroBaker 51ea11
 
DistroBaker 51ea11
                 if( 'd' == type[type_idx] ) {
DistroBaker 51ea11
-                    double *in_double = (double*)in_buf,
DistroBaker 51ea11
-                        *inout_double = (double*)inout_buf,
DistroBaker 51ea11
+                    double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)),
DistroBaker 51ea11
+                        *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)),
DistroBaker 51ea11
                         *inout_double_for_check = (double*)inout_check_buf;
DistroBaker 51ea11
                     for( i = 0; i < count; i++ ) {
DistroBaker 51ea11
                         in_double[i] = 10.0+1;
DistroBaker 51ea11
@@ -691,7 +738,7 @@ int main(int argc, char **argv)
DistroBaker 51ea11
         check_and_continue:
DistroBaker 51ea11
                 if( !skip_op_type )
DistroBaker 51ea11
                     print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
DistroBaker 51ea11
-                                 mpi_type, type_size, count, tend-tstart, correctness);
DistroBaker 51ea11
+                                 mpi_type, type_size, count, max_shift, duration, repeats, correctness);
DistroBaker 51ea11
             }
DistroBaker 51ea11
             if( !skip_op_type )
DistroBaker 51ea11
                 printf("\n");