1683 lines
85 KiB
Diff
1683 lines
85 KiB
Diff
|
From 6de8dfd236b7180b04c911d2306c3b805abf7182 Mon Sep 17 00:00:00 2001
|
||
|
From: George Bosilca <bosilca@icl.utk.edu>
|
||
|
Date: Mon, 28 Dec 2020 15:36:05 -0500
|
||
|
Subject: [PATCH 1/3] Major update to the AVX* detection and support
|
||
|
|
||
|
1. Consistent march flag order between configure and make.
|
||
|
|
||
|
2. op/avx: give the option to skip some tests
|
||
|
|
||
|
It is possible to skip some intrinsic tests by setting some environment variables to "no" before invoking configure:
|
||
|
- ompi_cv_op_avx_check_avx512
|
||
|
- ompi_cv_op_avx_check_avx2
|
||
|
- ompi_cv_op_avx_check_avx
|
||
|
- ompi_cv_op_avx_check_sse41
|
||
|
- ompi_cv_op_avx_check_sse3
|
||
|
|
||
|
3. op/avx: update AVX512 flags
|
||
|
|
||
|
try
|
||
|
-mavx512f -mavx512bw -mavx512vl -mavx512dq
|
||
|
instead of
|
||
|
-march=skylake-avx512
|
||
|
|
||
|
since the former is less likely to conflict with user provided CFLAGS
|
||
|
(e.g. -march=...)
|
||
|
|
||
|
Thanks Bart Oldeman for pointing this out.
|
||
|
|
||
|
4. op/avx: have the op/avx library depend on libmpi.so
|
||
|
|
||
|
Refs. open-mpi/ompi#8323
|
||
|
|
||
|
Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
|
||
|
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
|
||
|
---
|
||
|
ompi/mca/op/avx/Makefile.am | 4 +-
|
||
|
ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++-----------------
|
||
|
2 files changed, 174 insertions(+), 155 deletions(-)
|
||
|
|
||
|
diff --git ompi/mca/op/avx/Makefile.am ompi/mca/op/avx/Makefile.am
|
||
|
index 41dcf2e1834..b1d84d90b33 100644
|
||
|
--- ompi/mca/op/avx/Makefile.am
|
||
|
+++ ompi/mca/op/avx/Makefile.am
|
||
|
@@ -2,7 +2,7 @@
|
||
|
# Copyright (c) 2019-2020 The University of Tennessee and The University
|
||
|
# of Tennessee Research Foundation. All rights
|
||
|
# reserved.
|
||
|
-# Copyright (c) 2020 Research Organization for Information Science
|
||
|
+# Copyright (c) 2020-2021 Research Organization for Information Science
|
||
|
# and Technology (RIST). All rights reserved.
|
||
|
# $COPYRIGHT$
|
||
|
#
|
||
|
@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir)
|
||
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||
|
mca_op_avx_la_SOURCES = $(sources)
|
||
|
mca_op_avx_la_LIBADD = $(specialized_op_libs)
|
||
|
-mca_op_avx_la_LDFLAGS = -module -avoid-version
|
||
|
+mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
|
||
|
|
||
|
|
||
|
# Specific information for static builds.
|
||
|
diff --git ompi/mca/op/avx/configure.m4 ompi/mca/op/avx/configure.m4
|
||
|
index 09d8b374c8e..f61b7100ef4 100644
|
||
|
--- ompi/mca/op/avx/configure.m4
|
||
|
+++ ompi/mca/op/avx/configure.m4
|
||
|
@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
op_avx_support=0
|
||
|
op_avx2_support=0
|
||
|
op_avx512_support=0
|
||
|
+
|
||
|
+ AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3])
|
||
|
+ AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41])
|
||
|
+ AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx])
|
||
|
+ AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2])
|
||
|
+ AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512])
|
||
|
+
|
||
|
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
|
||
|
|
||
|
AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
|
||
|
@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
#
|
||
|
# Check for AVX512 support
|
||
|
#
|
||
|
- AC_MSG_CHECKING([for AVX512 support (no additional flags)])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
- __m512 vA, vB;
|
||
|
- _mm512_add_ps(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_avx512_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
-
|
||
|
- AS_IF([test $op_avx512_support -eq 0],
|
||
|
- [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS -march=skylake-avx512"
|
||
|
+ AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
|
||
|
+ AS_IF([test "$op_avx_check_avx512" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
|
||
|
AC_LINK_IFELSE(
|
||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
_mm512_add_ps(vA, vB)
|
||
|
]])],
|
||
|
[op_avx512_support=1
|
||
|
- MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
|
||
|
AC_MSG_RESULT([yes])],
|
||
|
[AC_MSG_RESULT([no])])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
- #
|
||
|
- # Some combination of gcc and older as would not correctly build the code generated by
|
||
|
- # _mm256_loadu_si256. Screen them out.
|
||
|
- #
|
||
|
- AS_IF([test $op_avx512_support -eq 1],
|
||
|
- [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+
|
||
|
+ AS_IF([test $op_avx512_support -eq 0],
|
||
|
+ [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
+ __m512 vA, vB;
|
||
|
+ _mm512_add_ps(vA, vB)
|
||
|
+ ]])],
|
||
|
+ [op_avx512_support=1
|
||
|
+ MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq"
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"
|
||
|
+ ])
|
||
|
+ #
|
||
|
+ # Some combination of gcc and older as would not correctly build the code generated by
|
||
|
+ # _mm256_loadu_si256. Screen them out.
|
||
|
+ #
|
||
|
+ AS_IF([test $op_avx512_support -eq 1],
|
||
|
+ [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
|
||
|
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
|
||
|
- ]])],
|
||
|
- [AC_MSG_RESULT([yes])],
|
||
|
- [op_avx512_support=0
|
||
|
- MCA_BUILD_OP_AVX512_FLAGS=""
|
||
|
- AC_MSG_RESULT([no])])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
- #
|
||
|
- # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
|
||
|
- #
|
||
|
- AS_IF([test $op_avx512_support -eq 1],
|
||
|
- [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ ]])],
|
||
|
+ [AC_MSG_RESULT([yes])],
|
||
|
+ [op_avx512_support=0
|
||
|
+ MCA_BUILD_OP_AVX512_FLAGS=""
|
||
|
+ AC_MSG_RESULT([no])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"
|
||
|
+ ])
|
||
|
+ #
|
||
|
+ # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
|
||
|
+ #
|
||
|
+ AS_IF([test $op_avx512_support -eq 1],
|
||
|
+ [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m512i vA, vB;
|
||
|
_mm512_mullo_epi64(vA, vB)
|
||
|
- ]])],
|
||
|
- [AC_MSG_RESULT([yes])],
|
||
|
- [op_avx512_support=0
|
||
|
- MCA_BUILD_OP_AVX512_FLAGS=""
|
||
|
- AC_MSG_RESULT([no])])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
+ ]])],
|
||
|
+ [AC_MSG_RESULT([yes])],
|
||
|
+ [op_avx512_support=0
|
||
|
+ MCA_BUILD_OP_AVX512_FLAGS=""
|
||
|
+ AC_MSG_RESULT([no])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"
|
||
|
+ ])])
|
||
|
#
|
||
|
# Check support for AVX2
|
||
|
#
|
||
|
- AC_MSG_CHECKING([for AVX2 support (no additional flags)])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
|
||
|
+ AS_IF([test "$op_avx_check_avx2" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m256 vA, vB;
|
||
|
_mm256_add_ps(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_avx2_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- AS_IF([test $op_avx2_support -eq 0],
|
||
|
- [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS -mavx2"
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ ]])],
|
||
|
+ [op_avx2_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])
|
||
|
+ AS_IF([test $op_avx2_support -eq 0],
|
||
|
+ [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="-mavx2 $CFLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m256 vA, vB;
|
||
|
_mm256_add_ps(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_avx2_support=1
|
||
|
- MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
- #
|
||
|
- # Some combination of gcc and older as would not correctly build the code generated by
|
||
|
- # _mm256_loadu_si256. Screen them out.
|
||
|
- #
|
||
|
- AS_IF([test $op_avx2_support -eq 1],
|
||
|
- [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ ]])],
|
||
|
+ [op_avx2_support=1
|
||
|
+ MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"
|
||
|
+ ])
|
||
|
+ #
|
||
|
+ # Some combination of gcc and older as would not correctly build the code generated by
|
||
|
+ # _mm256_loadu_si256. Screen them out.
|
||
|
+ #
|
||
|
+ AS_IF([test $op_avx2_support -eq 1],
|
||
|
+ [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
|
||
|
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
|
||
|
- ]])],
|
||
|
- [AC_MSG_RESULT([yes])],
|
||
|
- [op_avx2_support=0
|
||
|
- MCA_BUILD_OP_AVX2_FLAGS=""
|
||
|
- AC_MSG_RESULT([no])])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
+ ]])],
|
||
|
+ [AC_MSG_RESULT([yes])],
|
||
|
+ [op_avx2_support=0
|
||
|
+ MCA_BUILD_OP_AVX2_FLAGS=""
|
||
|
+ AC_MSG_RESULT([no])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"
|
||
|
+ ])])
|
||
|
#
|
||
|
# What about early AVX support. The rest of the logic is slightly different as
|
||
|
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
|
||
|
@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
|
||
|
# instructions.
|
||
|
#
|
||
|
- AC_MSG_CHECKING([for AVX support (no additional flags)])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
|
||
|
+ AS_IF([test "$op_avx_check_avx" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for AVX support (no additional flags)])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m128 vA, vB;
|
||
|
_mm_add_ps(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_avx_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
+ ]])],
|
||
|
+ [op_avx_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])])
|
||
|
#
|
||
|
# Check for SSE4.1 support
|
||
|
#
|
||
|
- AS_IF([test $op_avx_support -eq 1],
|
||
|
- [AC_MSG_CHECKING([for SSE4.1 support])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
|
||
|
+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for SSE4.1 support])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m128i vA, vB;
|
||
|
(void)_mm_max_epi8(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_sse41_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- ])
|
||
|
+ ]])],
|
||
|
+ [op_sse41_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])
|
||
|
+ ])
|
||
|
#
|
||
|
# Check for SSE3 support
|
||
|
#
|
||
|
- AS_IF([test $op_avx_support -eq 1],
|
||
|
- [AC_MSG_CHECKING([for SSE3 support])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
|
||
|
+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for SSE3 support])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
int A[4] = {0, 1, 2, 3};
|
||
|
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
|
||
|
- ]])],
|
||
|
- [op_sse3_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- ])
|
||
|
+ ]])],
|
||
|
+ [op_sse3_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])
|
||
|
+ ])
|
||
|
# Second pass, do we need to add the AVX flag ?
|
||
|
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
|
||
|
- [AC_MSG_CHECKING([for AVX support (with -mavx)])
|
||
|
- op_avx_cflags_save="$CFLAGS"
|
||
|
- CFLAGS="$CFLAGS -mavx"
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ [AS_IF([test "$op_avx_check_avx" = "yes"],
|
||
|
+ [AC_MSG_CHECKING([for AVX support (with -mavx)])
|
||
|
+ op_avx_cflags_save="$CFLAGS"
|
||
|
+ CFLAGS="-mavx $CFLAGS"
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m128 vA, vB;
|
||
|
_mm_add_ps(vA, vB)
|
||
|
]])],
|
||
|
- [op_avx_support=1
|
||
|
- MCA_BUILD_OP_AVX_FLAGS="-mavx"
|
||
|
- op_sse41_support=0
|
||
|
- op_sse3_support=0
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
+ [op_avx_support=1
|
||
|
+ MCA_BUILD_OP_AVX_FLAGS="-mavx"
|
||
|
+ op_sse41_support=0
|
||
|
+ op_sse3_support=0
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])])
|
||
|
|
||
|
- AS_IF([test $op_sse41_support -eq 0],
|
||
|
- [AC_MSG_CHECKING([for SSE4.1 support])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
- [[
|
||
|
+ AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0],
|
||
|
+ [AC_MSG_CHECKING([for SSE4.1 support])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ [[
|
||
|
__m128i vA, vB;
|
||
|
(void)_mm_max_epi8(vA, vB)
|
||
|
- ]])],
|
||
|
- [op_sse41_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- ])
|
||
|
- AS_IF([test $op_sse3_support -eq 0],
|
||
|
- [AC_MSG_CHECKING([for SSE3 support])
|
||
|
- AC_LINK_IFELSE(
|
||
|
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
+ ]])],
|
||
|
+ [op_sse41_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])])
|
||
|
+ AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0],
|
||
|
+ [AC_MSG_CHECKING([for SSE3 support])
|
||
|
+ AC_LINK_IFELSE(
|
||
|
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
int A[4] = {0, 1, 2, 3};
|
||
|
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
|
||
|
]])],
|
||
|
- [op_sse3_support=1
|
||
|
- AC_MSG_RESULT([yes])],
|
||
|
- [AC_MSG_RESULT([no])])
|
||
|
- ])
|
||
|
- CFLAGS="$op_avx_cflags_save"
|
||
|
- ])
|
||
|
+ [op_sse3_support=1
|
||
|
+ AC_MSG_RESULT([yes])],
|
||
|
+ [AC_MSG_RESULT([no])])])
|
||
|
+ CFLAGS="$op_avx_cflags_save"])
|
||
|
|
||
|
AC_LANG_POP([C])
|
||
|
])
|
||
|
@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
|
||
|
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
|
||
|
|
||
|
+ AS_VAR_POPDEF([op_avx_check_avx512])
|
||
|
+ AS_VAR_POPDEF([op_avx_check_avx2])
|
||
|
+ AS_VAR_POPDEF([op_avx_check_avx])
|
||
|
+ AS_VAR_POPDEF([op_avx_check_sse41])
|
||
|
+ AS_VAR_POPDEF([op_avx_check_sse3])
|
||
|
+
|
||
|
OPAL_VAR_SCOPE_POP
|
||
|
# Enable this component iff we have at least the most basic form of support
|
||
|
# for vectorial ISA
|
||
|
|
||
|
From 1c386327da5757632392aa6c967acfbebc85c7ff Mon Sep 17 00:00:00 2001
|
||
|
From: George Bosilca <bosilca@icl.utk.edu>
|
||
|
Date: Mon, 28 Dec 2020 12:18:07 -0500
|
||
|
Subject: [PATCH 2/3] AVX code generation improvements
|
||
|
|
||
|
1. Allow fallback to a lesser AVX support during make
|
||
|
|
||
|
Due to the fact that some distros restrict the compile architecture
|
||
|
during make (while not setting any restrictions during configure) we
|
||
|
need to detect the target architecture also during make in order to
|
||
|
restrict the code we generate.
|
||
|
|
||
|
2. Add comments and better protect the arch specific code.
|
||
|
|
||
|
Identify all the vectorial functions used and classify them according to
|
||
|
the necessary hardware capabilities.
|
||
|
Use these requirements to protect the code for load and stores (the rest
|
||
|
of the code being automatically generated it is more difficult to
|
||
|
protect).
|
||
|
|
||
|
3. Correctly check for AVX* support.
|
||
|
|
||
|
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
|
||
|
---
|
||
|
ompi/mca/op/avx/configure.m4 | 28 +--
|
||
|
ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++-----
|
||
|
2 files changed, 288 insertions(+), 62 deletions(-)
|
||
|
|
||
|
diff --git ompi/mca/op/avx/configure.m4 ompi/mca/op/avx/configure.m4
|
||
|
index f61b7100ef4..f3651f09d43 100644
|
||
|
--- ompi/mca/op/avx/configure.m4
|
||
|
+++ ompi/mca/op/avx/configure.m4
|
||
|
@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
#
|
||
|
# Check for AVX512 support
|
||
|
#
|
||
|
- AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
|
||
|
+ AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
|
||
|
AS_IF([test "$op_avx_check_avx512" = "yes"],
|
||
|
[AC_MSG_CHECKING([for AVX512 support (no additional flags)])
|
||
|
AC_LINK_IFELSE(
|
||
|
@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
#
|
||
|
# Check support for AVX2
|
||
|
#
|
||
|
- AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
|
||
|
+ AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
|
||
|
AS_IF([test "$op_avx_check_avx2" = "yes"],
|
||
|
[AC_MSG_CHECKING([for AVX2 support (no additional flags)])
|
||
|
AC_LINK_IFELSE(
|
||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
- __m256 vA, vB;
|
||
|
- _mm256_add_ps(vA, vB)
|
||
|
+ __m256i vA, vB, vC;
|
||
|
+ vC = _mm256_and_si256(vA, vB)
|
||
|
]])],
|
||
|
[op_avx2_support=1
|
||
|
AC_MSG_RESULT([yes])],
|
||
|
@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
AC_LINK_IFELSE(
|
||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
- __m256 vA, vB;
|
||
|
- _mm256_add_ps(vA, vB)
|
||
|
+ __m256i vA, vB, vC;
|
||
|
+ vC = _mm256_and_si256(vA, vB)
|
||
|
]])],
|
||
|
[op_avx2_support=1
|
||
|
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
|
||
|
@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
CFLAGS="$op_avx_cflags_save"
|
||
|
])])
|
||
|
#
|
||
|
- # What about early AVX support. The rest of the logic is slightly different as
|
||
|
+ # What about early AVX support? The rest of the logic is slightly different as
|
||
|
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
|
||
|
# if we can compile AVX code without a flag, then we validate that we have support
|
||
|
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
|
||
|
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
|
||
|
# instructions.
|
||
|
#
|
||
|
- AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
|
||
|
+ AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
|
||
|
AS_IF([test "$op_avx_check_avx" = "yes"],
|
||
|
[AC_MSG_CHECKING([for AVX support (no additional flags)])
|
||
|
AC_LINK_IFELSE(
|
||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
- __m128 vA, vB;
|
||
|
- _mm_add_ps(vA, vB)
|
||
|
+ __m256 vA, vB, vC;
|
||
|
+ vC = _mm256_add_ps(vA, vB)
|
||
|
]])],
|
||
|
[op_avx_support=1
|
||
|
AC_MSG_RESULT([yes])],
|
||
|
@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
#
|
||
|
# Check for SSE4.1 support
|
||
|
#
|
||
|
- AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
|
||
|
+ AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
|
||
|
AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
|
||
|
[AC_MSG_CHECKING([for SSE4.1 support])
|
||
|
AC_LINK_IFELSE(
|
||
|
@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
#
|
||
|
# Check for SSE3 support
|
||
|
#
|
||
|
- AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
|
||
|
+ AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
|
||
|
AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
|
||
|
[AC_MSG_CHECKING([for SSE3 support])
|
||
|
AC_LINK_IFELSE(
|
||
|
@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||
|
AC_LINK_IFELSE(
|
||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||
|
[[
|
||
|
- __m128 vA, vB;
|
||
|
- _mm_add_ps(vA, vB)
|
||
|
+ __m256 vA, vB, vC;
|
||
|
+ vC = _mm256_add_ps(vA, vB)
|
||
|
]])],
|
||
|
[op_avx_support=1
|
||
|
MCA_BUILD_OP_AVX_FLAGS="-mavx"
|
||
|
diff --git ompi/mca/op/avx/op_avx_functions.c ompi/mca/op/avx/op_avx_functions.c
|
||
|
index c79a8b34506..575ebf95d6a 100644
|
||
|
--- ompi/mca/op/avx/op_avx_functions.c
|
||
|
+++ ompi/mca/op/avx/op_avx_functions.c
|
||
|
@@ -1,5 +1,5 @@
|
||
|
/*
|
||
|
- * Copyright (c) 2019-2020 The University of Tennessee and The University
|
||
|
+ * Copyright (c) 2019-2021 The University of Tennessee and The University
|
||
|
* of Tennessee Research Foundation. All rights
|
||
|
* reserved.
|
||
|
* Copyright (c) 2020 Research Organization for Information Science
|
||
|
@@ -24,16 +24,42 @@
|
||
|
#include "ompi/mca/op/avx/op_avx.h"
|
||
|
|
||
|
#include <immintrin.h>
|
||
|
-
|
||
|
+/**
|
||
|
+ * The following logic is necessary to cope with distro maintainer's desire to change the compilation
|
||
|
+ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what
|
||
|
+ * code can be generated during make. If we detect that the current code generation architecture has
|
||
|
+ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back
|
||
|
+ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out).
|
||
|
+ */
|
||
|
#if defined(GENERATE_AVX512_CODE)
|
||
|
-#define PREPEND _avx512
|
||
|
-#elif defined(GENERATE_AVX2_CODE)
|
||
|
-#define PREPEND _avx2
|
||
|
-#elif defined(GENERATE_AVX_CODE)
|
||
|
-#define PREPEND _avx
|
||
|
-#else
|
||
|
-#error This file should not be compiled in this conditions
|
||
|
-#endif
|
||
|
+# if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__)
|
||
|
+# define PREPEND _avx512
|
||
|
+# else
|
||
|
+# undef GENERATE_AVX512_CODE
|
||
|
+# endif /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */
|
||
|
+#endif /* defined(GENERATE_AVX512_CODE) */
|
||
|
+
|
||
|
+#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE)
|
||
|
+# if defined(__AVX2__)
|
||
|
+# define PREPEND _avx2
|
||
|
+# else
|
||
|
+# undef GENERATE_AVX2_CODE
|
||
|
+# endif /* defined(__AVX2__) */
|
||
|
+#endif /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */
|
||
|
+
|
||
|
+#if !defined(PREPEND) && defined(GENERATE_AVX_CODE)
|
||
|
+# if defined(__AVX__)
|
||
|
+# define PREPEND _avx
|
||
|
+# endif
|
||
|
+#endif /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */
|
||
|
+
|
||
|
+#if !defined(PREPEND)
|
||
|
+# if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2
|
||
|
+# error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step.
|
||
|
+# else
|
||
|
+# error This file should not be compiled in this conditions. Please provide the config.log file to the OMPI developers.
|
||
|
+# endif /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */
|
||
|
+#endif /* !defined(PREPEND) */
|
||
|
|
||
|
/*
|
||
|
* Concatenate preprocessor tokens A and B without expanding macro definitions
|
||
|
@@ -46,6 +72,102 @@
|
||
|
*/
|
||
|
#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
|
||
|
|
||
|
+/*
|
||
|
+ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq
|
||
|
+ *
|
||
|
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide
|
||
|
+ *
|
||
|
+ * _mm_add_epi[8,16,32,64] SSE2
|
||
|
+ * _mm_add_pd SSE2
|
||
|
+ * _mm_add_ps SSE
|
||
|
+ * _mm_adds_epi[8,16] SSE2
|
||
|
+ * _mm_adds_epu[8,16] SSE2
|
||
|
+ * _mm_and_si128 SSE2
|
||
|
+ * _mm_lddqu_si128 SSE3
|
||
|
+ * _mm_loadu_pd SSE2
|
||
|
+ * _mm_loadu_ps SSE
|
||
|
+ * _mm_max_epi8 SSE4.1
|
||
|
+ * _mm_max_epi16 SSE2
|
||
|
+ * _mm_max_epi32 SSE4.1
|
||
|
+ * _mm_max_epi64 AVX512VL + AVX512F
|
||
|
+ * _mm_max_epu8 SSE2
|
||
|
+ * _mm_max_epu[16,32] SSE4.1
|
||
|
+ * _mm_max_epu64 AVX512VL + AVX512F
|
||
|
+ * _mm_max_pd SSE2
|
||
|
+ * _mm_max_ps SSE
|
||
|
+ * _mm_min_epi8 SSE4.1
|
||
|
+ * _mm_min_epi16 SSE2
|
||
|
+ * _mm_min_epi32 SSE4.1
|
||
|
+ * _mm_min_epi64 AVX512VL + AVX512F
|
||
|
+ * _mm_min_epu8 SSE2
|
||
|
+ * _mm_min_epu[16,32] SSE4.1
|
||
|
+ * _mm_min_epu64 AVX512VL + AVX512F
|
||
|
+ * _mm_min_pd SSE2
|
||
|
+ * _mm_min_ps SSE
|
||
|
+ * _mm_mul_pd SSE2
|
||
|
+ * _mm_mul_ps SSE
|
||
|
+ * _mm_mullo_epi16 SSE2
|
||
|
+ * _mm_mullo_epi32 SSE4.1
|
||
|
+ * _mm_mullo_epi64 AVX512VL + AVX512DQ
|
||
|
+ * _mm_or_si128 SSE2
|
||
|
+ * _mm_storeu_pd SSE2
|
||
|
+ * _mm_storeu_ps SSE
|
||
|
+ * _mm_storeu_si128 SSE2
|
||
|
+ * _mm_xor_si128 SSE2
|
||
|
+ * _mm256_add_epi[8,16,32,64] AVX2
|
||
|
+ * _mm256_add_p[s,d] AVX
|
||
|
+ * _mm256_adds_epi[8,16] AVX2
|
||
|
+ * _mm256_adds_epu[8,16] AVX2
|
||
|
+ * _mm256_and_si256 AVX2
|
||
|
+ * _mm256_loadu_p[s,d] AVX
|
||
|
+ * _mm256_loadu_si256 AVX
|
||
|
+ * _mm256_max_epi[8,16,32] AVX2
|
||
|
+ * _mm256_max_epi64 AVX512VL + AVX512F
|
||
|
+ * _mm256_max_epu[8,16,32] AVX2
|
||
|
+ * _mm256_max_epu64 AVX512VL + AVX512F
|
||
|
+ * _mm256_max_p[s,d] AVX
|
||
|
+ * _mm256_min_epi[8,16,32] AVX2
|
||
|
+ * _mm256_min_epi64 AVX512VL + AVX512F
|
||
|
+ * _mm256_min_epu[8,16,32] AVX2
|
||
|
+ * _mm256_min_epu64 AVX512VL + AVX512F
|
||
|
+ * _mm256_min_p[s,d] AVX
|
||
|
+ * _mm256_mul_p[s,d] AVX
|
||
|
+ * _mm256_mullo_epi[16,32] AVX2
|
||
|
+ * _mm256_mullo_epi64 AVX512VL + AVX512DQ
|
||
|
+ * _mm256_or_si256 AVX2
|
||
|
+ * _mm256_storeu_p[s,d] AVX
|
||
|
+ * _mm256_storeu_si256 AVX
|
||
|
+ * _mm256_xor_si256 AVX2
|
||
|
+ * _mm512_add_epi[8,16] AVX512BW
|
||
|
+ * _mm512_add_epi[32,64] AVX512F
|
||
|
+ * _mm512_add_p[s,d] AVX512F
|
||
|
+ * _mm512_adds_epi[8,16] AVX512BW
|
||
|
+ * _mm512_adds_epu[8,16] AVX512BW
|
||
|
+ * _mm512_and_si512 AVX512F
|
||
|
+ * _mm512_cvtepi16_epi8 AVX512BW
|
||
|
+ * _mm512_cvtepi8_epi16 AVX512BW
|
||
|
+ * _mm512_loadu_p[s,d] AVX512F
|
||
|
+ * _mm512_loadu_si512 AVX512F
|
||
|
+ * _mm512_max_epi[8,16] AVX512BW
|
||
|
+ * _mm512_max_epi[32,64] AVX512F
|
||
|
+ * _mm512_max_epu[8,16] AVX512BW
|
||
|
+ * _mm512_max_epu[32,64] AVX512F
|
||
|
+ * _mm512_max_p[s,d] AVX512F
|
||
|
+ * _mm512_min_epi[8,16] AVX512BW
|
||
|
+ * _mm512_min_epi[32,64] AVX512F
|
||
|
+ * _mm512_min_epu[8,16] AVX512BW
|
||
|
+ * _mm512_min_epu[32,64] AVX512F
|
||
|
+ * _mm512_min_p[s,d] AVX512F
|
||
|
+ * _mm512_mul_p[s,d] AVX512F
|
||
|
+ * _mm512_mullo_epi16 AVX512BW
|
||
|
+ * _mm512_mullo_epi32 AVX512F
|
||
|
+ * _mm512_mullo_epi64 AVX512DQ
|
||
|
+ * _mm512_or_si512 AVX512F
|
||
|
+ * _mm512_storeu_p[s,d] AVX512F
|
||
|
+ * _mm512_storeu_si512 AVX512F
|
||
|
+ * _mm512_xor_si512 AVX512F
|
||
|
+ */
|
||
|
+
|
||
|
/*
|
||
|
* Since all the functions in this file are essentially identical, we
|
||
|
* use a macro to substitute in names and types. The core operation
|
||
|
@@ -62,13 +184,14 @@
|
||
|
(((_flag) & mca_op_avx_component.flags) == (_flag))
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
|
||
|
int types_per_step = (512 / 8) / sizeof(type); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
- __m512i vecA = _mm512_loadu_si512((__m512*)in); \
|
||
|
+ __m512i vecA = _mm512_loadu_si512((__m512*)in); \
|
||
|
in += types_per_step; \
|
||
|
- __m512i vecB = _mm512_loadu_si512((__m512*)out); \
|
||
|
+ __m512i vecB = _mm512_loadu_si512((__m512*)out); \
|
||
|
__m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
_mm512_storeu_si512((__m512*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
@@ -76,10 +199,14 @@
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \
|
||
|
@@ -87,30 +214,37 @@
|
||
|
__m256i vecA = _mm256_loadu_si256((__m256i*)in); \
|
||
|
in += types_per_step; \
|
||
|
__m256i vecB = _mm256_loadu_si256((__m256i*)out); \
|
||
|
- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
_mm256_storeu_si256((__m256i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE3__
|
||
|
#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
|
||
|
- int types_per_step = (128 / 8) / sizeof(type); /* AVX */ \
|
||
|
+ int types_per_step = (128 / 8) / sizeof(type); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
__m128i vecA = _mm_lddqu_si128((__m128i*)in); \
|
||
|
in += types_per_step; \
|
||
|
__m128i vecB = _mm_lddqu_si128((__m128i*)out); \
|
||
|
- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
_mm_storeu_si128((__m128i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
|
||
|
+#endif /* __SSE3__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512BW__ && __AVX__
|
||
|
#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
|
||
|
int types_per_step = (256 / 8) / sizeof(type); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \
|
||
|
- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \
|
||
|
+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \
|
||
|
+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \
|
||
|
in += types_per_step; \
|
||
|
__m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
|
||
|
__m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
|
||
|
@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
|
||
|
+#endif /* __AVX512BW__ && __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
/**
|
||
|
@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
|
||
|
*
|
||
|
*/
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(type); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512i vecA = _mm512_loadu_si512((__m512i*)in); \
|
||
|
+ __m512i vecA = _mm512_loadu_si512((__m512i*)in); \
|
||
|
in += types_per_step; \
|
||
|
- __m512i vecB = _mm512_loadu_si512((__m512i*)out); \
|
||
|
+ __m512i vecB = _mm512_loadu_si512((__m512i*)out); \
|
||
|
__m512i res = _mm512_##op##_si512(vecA, vecB); \
|
||
|
_mm512_storeu_si512((__m512i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(type); \
|
||
|
@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
|
||
|
__m256i vecA = _mm256_loadu_si256((__m256i*)in); \
|
||
|
in += types_per_step; \
|
||
|
__m256i vecB = _mm256_loadu_si256((__m256i*)out); \
|
||
|
- __m256i res = _mm256_##op##_si256(vecA, vecB); \
|
||
|
+ __m256i res = _mm256_##op##_si256(vecA, vecB); \
|
||
|
_mm256_storeu_si256((__m256i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE3__ && __SSE2__
|
||
|
#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(type); \
|
||
|
@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
|
||
|
__m128i vecA = _mm_lddqu_si128((__m128i*)in); \
|
||
|
in += types_per_step; \
|
||
|
__m128i vecB = _mm_lddqu_si128((__m128i*)out); \
|
||
|
- __m128i res = _mm_##op##_si128(vecA, vecB); \
|
||
|
+ __m128i res = _mm_##op##_si128(vecA, vecB); \
|
||
|
_mm_storeu_si128((__m128i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
|
||
|
+#endif /* __SSE3__ && __SSE2__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_FLOAT_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(float); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512 vecA = _mm512_loadu_ps((__m512*)in); \
|
||
|
- __m512 vecB = _mm512_loadu_ps((__m512*)out); \
|
||
|
+ __m512 vecA = _mm512_loadu_ps((__m512*)in); \
|
||
|
+ __m512 vecB = _mm512_loadu_ps((__m512*)out); \
|
||
|
in += types_per_step; \
|
||
|
__m512 res = _mm512_##op##_ps(vecA, vecB); \
|
||
|
_mm512_storeu_ps((__m512*)out, res); \
|
||
|
@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_FLOAT_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX_FLOAT_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(float); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
- __m256 vecA = _mm256_loadu_ps(in); \
|
||
|
+ __m256 vecA = _mm256_loadu_ps(in); \
|
||
|
in += types_per_step; \
|
||
|
- __m256 vecB = _mm256_loadu_ps(out); \
|
||
|
+ __m256 vecB = _mm256_loadu_ps(out); \
|
||
|
__m256 res = _mm256_##op##_ps(vecA, vecB); \
|
||
|
- _mm256_storeu_ps(out, res); \
|
||
|
+ _mm256_storeu_ps(out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX_FLOAT_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE__
|
||
|
#define OP_AVX_SSE_FLOAT_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(float); \
|
||
|
@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
|
||
|
+#endif /* __SSE__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE_FLOAT_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_DOUBLE_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(double); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512d vecA = _mm512_loadu_pd(in); \
|
||
|
+ __m512d vecA = _mm512_loadu_pd(in); \
|
||
|
in += types_per_step; \
|
||
|
- __m512d vecB = _mm512_loadu_pd(out); \
|
||
|
+ __m512d vecB = _mm512_loadu_pd(out); \
|
||
|
__m512d res = _mm512_##op##_pd(vecA, vecB); \
|
||
|
_mm512_storeu_pd((out), res); \
|
||
|
out += types_per_step; \
|
||
|
@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX_DOUBLE_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(double); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
- __m256d vecA = _mm256_loadu_pd(in); \
|
||
|
+ __m256d vecA = _mm256_loadu_pd(in); \
|
||
|
in += types_per_step; \
|
||
|
- __m256d vecB = _mm256_loadu_pd(out); \
|
||
|
+ __m256d vecB = _mm256_loadu_pd(out); \
|
||
|
__m256d res = _mm256_##op##_pd(vecA, vecB); \
|
||
|
_mm256_storeu_pd(out, res); \
|
||
|
out += types_per_step; \
|
||
|
@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX_DOUBLE_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE2__
|
||
|
#define OP_AVX_SSE2_DOUBLE_FUNC(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(double); \
|
||
|
@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
|
||
|
+#endif /* __SSE2__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
|
||
|
* routines, needed for some optimizations.
|
||
|
*/
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
|
||
|
int types_per_step = (512 / 8) / sizeof(type); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512i vecA = _mm512_loadu_si512(in1); \
|
||
|
- __m512i vecB = _mm512_loadu_si512(in2); \
|
||
|
+ __m512i vecA = _mm512_loadu_si512(in1); \
|
||
|
+ __m512i vecB = _mm512_loadu_si512(in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
int types_per_step = (256 / 8) / sizeof(type); \
|
||
|
@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
|
||
|
__m256i vecB = _mm256_loadu_si256((__m256i*)in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
_mm256_storeu_si256((__m256i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE3__ && __SSE2__
|
||
|
#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
|
||
|
int types_per_step = (128 / 8) / sizeof(type); \
|
||
|
@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
|
||
|
__m128i vecB = _mm_lddqu_si128((__m128i*)in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
|
||
|
_mm_storeu_si128((__m128i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
|
||
|
+#endif /* __SSE3__ && __SSE2__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512BW__ && __AVX__
|
||
|
#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
|
||
|
int types_per_step = (256 / 8) / sizeof(type); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \
|
||
|
- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \
|
||
|
+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \
|
||
|
+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
|
||
|
@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
|
||
|
+#endif /* __AVX512BW__ && __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
/**
|
||
|
@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(type); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512i vecA = _mm512_loadu_si512(in1); \
|
||
|
- __m512i vecB = _mm512_loadu_si512(in2); \
|
||
|
+ __m512i vecA = _mm512_loadu_si512(in1); \
|
||
|
+ __m512i vecB = _mm512_loadu_si512(in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m512i res = _mm512_##op##_si512(vecA, vecB); \
|
||
|
@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(type); \
|
||
|
@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
__m256i vecB = _mm256_loadu_si256((__m256i*)in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
- __m256i res = _mm256_##op##_si256(vecA, vecB); \
|
||
|
+ __m256i res = _mm256_##op##_si256(vecA, vecB); \
|
||
|
_mm256_storeu_si256((__m256i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE3__ && __SSE2__
|
||
|
#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(type); \
|
||
|
@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
|
||
|
__m128i vecB = _mm_lddqu_si128((__m128i*)in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
- __m128i res = _mm_##op##_si128(vecA, vecB); \
|
||
|
+ __m128i res = _mm_##op##_si128(vecA, vecB); \
|
||
|
_mm_storeu_si128((__m128i*)out, res); \
|
||
|
out += types_per_step; \
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
|
||
|
+#endif /* __SSE3__ && __SSE2__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_FLOAT_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(float); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512 vecA = _mm512_loadu_ps(in1); \
|
||
|
- __m512 vecB = _mm512_loadu_ps(in2); \
|
||
|
+ __m512 vecA = _mm512_loadu_ps(in1); \
|
||
|
+ __m512 vecB = _mm512_loadu_ps(in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m512 res = _mm512_##op##_ps(vecA, vecB); \
|
||
|
@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX_FLOAT_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(float); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
- __m256 vecA = _mm256_loadu_ps(in1); \
|
||
|
- __m256 vecB = _mm256_loadu_ps(in2); \
|
||
|
+ __m256 vecA = _mm256_loadu_ps(in1); \
|
||
|
+ __m256 vecB = _mm256_loadu_ps(in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m256 res = _mm256_##op##_ps(vecA, vecB); \
|
||
|
@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE__
|
||
|
#define OP_AVX_SSE_FLOAT_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(float); \
|
||
|
@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
|
||
|
+#endif /* __SSE__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
|
||
|
}
|
||
|
|
||
|
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
|
||
|
+#if __AVX512F__
|
||
|
#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
|
||
|
types_per_step = (512 / 8) / sizeof(double); \
|
||
|
for (; left_over >= types_per_step; left_over -= types_per_step) { \
|
||
|
- __m512d vecA = _mm512_loadu_pd((in1)); \
|
||
|
- __m512d vecB = _mm512_loadu_pd((in2)); \
|
||
|
+ __m512d vecA = _mm512_loadu_pd((in1)); \
|
||
|
+ __m512d vecB = _mm512_loadu_pd((in2)); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m512d res = _mm512_##op##_pd(vecA, vecB); \
|
||
|
@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd
|
||
|
+#endif /* __AVX512F__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
|
||
|
|
||
|
#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
|
||
|
+#if __AVX__
|
||
|
#define OP_AVX_AVX_DOUBLE_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
|
||
|
types_per_step = (256 / 8) / sizeof(double); \
|
||
|
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
|
||
|
- __m256d vecA = _mm256_loadu_pd(in1); \
|
||
|
- __m256d vecB = _mm256_loadu_pd(in2); \
|
||
|
+ __m256d vecA = _mm256_loadu_pd(in1); \
|
||
|
+ __m256d vecB = _mm256_loadu_pd(in2); \
|
||
|
in1 += types_per_step; \
|
||
|
in2 += types_per_step; \
|
||
|
__m256d res = _mm256_##op##_pd(vecA, vecB); \
|
||
|
@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
|
||
|
if( 0 == left_over ) return; \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
|
||
|
+#endif /* __AVX__ */
|
||
|
+#else
|
||
|
#define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
|
||
|
|
||
|
#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
|
||
|
+#if __SSE2__
|
||
|
#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) \
|
||
|
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
|
||
|
types_per_step = (128 / 8) / sizeof(double); \
|
||
|
@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
|
||
|
} \
|
||
|
}
|
||
|
#else
|
||
|
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
|
||
|
+#endif /* __SSE2__ */
|
||
|
+#else
|
||
|
#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
|
||
|
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
|
||
|
|
||
|
|
||
|
From ac6f658a4127bbfd916f26946330d9906dcbae54 Mon Sep 17 00:00:00 2001
|
||
|
From: George Bosilca <bosilca@icl.utk.edu>
|
||
|
Date: Tue, 5 Jan 2021 22:40:26 -0500
|
||
|
Subject: [PATCH 3/3] A better test for MPI_OP performance.
|
||
|
|
||
|
The test now has the ability to add a shift to all or to any of the
|
||
|
input and output buffers to assess the impact of unaligned operations.
|
||
|
|
||
|
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
|
||
|
---
|
||
|
test/datatype/reduce_local.c | 161 ++++++++++++++++++++++-------------
|
||
|
1 file changed, 104 insertions(+), 57 deletions(-)
|
||
|
|
||
|
diff --git test/datatype/reduce_local.c test/datatype/reduce_local.c
|
||
|
index 97890f94227..f227439b714 100644
|
||
|
--- test/datatype/reduce_local.c
|
||
|
+++ test/datatype/reduce_local.c
|
||
|
@@ -59,7 +59,7 @@ static int total_errors = 0;
|
||
|
_a < _b ? _a : _b; })
|
||
|
|
||
|
static void print_status(char* op, char* type, int type_size,
|
||
|
- int count, double duration,
|
||
|
+ int count, int max_shift, double *duration, int repeats,
|
||
|
int correct )
|
||
|
{
|
||
|
if(correct) {
|
||
|
@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size,
|
||
|
printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
|
||
|
total_errors++;
|
||
|
}
|
||
|
- printf(" count %-10d time %.6f seconds\n", count, duration);
|
||
|
+ if( 1 == max_shift ) {
|
||
|
+ printf(" count %-10d time (seconds) %.8f seconds\n", count, duration[0] / repeats);
|
||
|
+ } else {
|
||
|
+ printf(" count %-10d time (seconds / shifts) ", count);
|
||
|
+ for( int i = 0; i < max_shift; i++ ) {
|
||
|
+ printf("%.8f ", duration[i] / repeats );
|
||
|
+ }
|
||
|
+ printf("\n");
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
static int do_ops_built = 0;
|
||
|
@@ -115,19 +123,23 @@ do { \
|
||
|
const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
|
||
|
TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
|
||
|
skip_op_type = 0; \
|
||
|
- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
|
||
|
- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
|
||
|
- tstart = MPI_Wtime(); \
|
||
|
- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
|
||
|
- tend = MPI_Wtime(); \
|
||
|
- if( check ) { \
|
||
|
- for( i = 0; i < (COUNT)-_k; i++ ) { \
|
||
|
- if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
|
||
|
- continue; \
|
||
|
- printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
|
||
|
- _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
|
||
|
- correctness = 0; \
|
||
|
- break; \
|
||
|
+ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
|
||
|
+ duration[_k] = 0.0; \
|
||
|
+ for(int _r = repeats; _r > 0; _r--) { \
|
||
|
+ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
|
||
|
+ tstart = MPI_Wtime(); \
|
||
|
+ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
|
||
|
+ tend = MPI_Wtime(); \
|
||
|
+ duration[_k] += (tend - tstart); \
|
||
|
+ if( check ) { \
|
||
|
+ for( i = 0; i < (COUNT)-_k; i++ ) { \
|
||
|
+ if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
|
||
|
+ continue; \
|
||
|
+ printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
|
||
|
+ _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
|
||
|
+ correctness = 0; \
|
||
|
+ break; \
|
||
|
+ } \
|
||
|
} \
|
||
|
} \
|
||
|
} \
|
||
|
@@ -139,20 +151,24 @@ do { \
|
||
|
const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
|
||
|
TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
|
||
|
skip_op_type = 0; \
|
||
|
- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
|
||
|
- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
|
||
|
- tstart = MPI_Wtime(); \
|
||
|
- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
|
||
|
- tend = MPI_Wtime(); \
|
||
|
- if( check ) { \
|
||
|
- for( i = 0; i < (COUNT); i++ ) { \
|
||
|
- TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
|
||
|
- if(_v2 == OPNAME(_v1, _v3)) \
|
||
|
- continue; \
|
||
|
- printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
|
||
|
- _k, i, _v1, (#OPNAME), _v3, _v2); \
|
||
|
- correctness = 0; \
|
||
|
- break; \
|
||
|
+ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
|
||
|
+ duration[_k] = 0.0; \
|
||
|
+ for(int _r = repeats; _r > 0; _r--) { \
|
||
|
+ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
|
||
|
+ tstart = MPI_Wtime(); \
|
||
|
+ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
|
||
|
+ tend = MPI_Wtime(); \
|
||
|
+ duration[_k] += (tend - tstart); \
|
||
|
+ if( check ) { \
|
||
|
+ for( i = 0; i < (COUNT); i++ ) { \
|
||
|
+ TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
|
||
|
+ if(_v2 == OPNAME(_v1, _v3)) \
|
||
|
+ continue; \
|
||
|
+ printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
|
||
|
+ _k, i, _v1, (#OPNAME), _v3, _v2); \
|
||
|
+ correctness = 0; \
|
||
|
+ break; \
|
||
|
+ } \
|
||
|
} \
|
||
|
} \
|
||
|
} \
|
||
|
@@ -163,24 +179,36 @@ int main(int argc, char **argv)
|
||
|
{
|
||
|
static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
|
||
|
int count, type_size = 8, rank, size, provided, correctness = 1;
|
||
|
- int repeats = 1, i, c;
|
||
|
- double tstart, tend;
|
||
|
+ int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0;
|
||
|
+ int max_shift = 4;
|
||
|
+ double *duration, tstart, tend;
|
||
|
bool check = true;
|
||
|
char type[5] = "uifd", *op = "sum", *mpi_type;
|
||
|
int lower = 1, upper = 1000000, skip_op_type;
|
||
|
MPI_Op mpi_op;
|
||
|
|
||
|
- while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) {
|
||
|
+ while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) {
|
||
|
switch(c) {
|
||
|
case 'l':
|
||
|
lower = atoi(optarg);
|
||
|
if( lower <= 0 ) {
|
||
|
- fprintf(stderr, "The number of elements must be positive\n");
|
||
|
+ fprintf(stderr, "The lower number of elements must be positive\n");
|
||
|
exit(-1);
|
||
|
}
|
||
|
break;
|
||
|
case 'u':
|
||
|
upper = atoi(optarg);
|
||
|
+ if( upper <= 0 ) {
|
||
|
+ fprintf(stderr, "The upper number of elements must be positive\n");
|
||
|
+ exit(-1);
|
||
|
+ }
|
||
|
+ break;
|
||
|
+ case 'i':
|
||
|
+ max_shift = atoi(optarg);
|
||
|
+ if( max_shift <= 0 ) {
|
||
|
+ fprintf(stderr, "The max shift must be positive\n");
|
||
|
+ exit(-1);
|
||
|
+ }
|
||
|
break;
|
||
|
case 'f':
|
||
|
check = false;
|
||
|
@@ -216,14 +244,32 @@ int main(int argc, char **argv)
|
||
|
exit(-1);
|
||
|
}
|
||
|
break;
|
||
|
+ case '1':
|
||
|
+ op1_alignment = atoi(optarg);
|
||
|
+ if( op1_alignment < 0 ) {
|
||
|
+ fprintf(stderr, "alignment for the first operand must not be negative\n");
|
||
|
+ exit(-1);
|
||
|
+ }
|
||
|
+ break;
|
||
|
+ case '2':
|
||
|
+ res_alignment = atoi(optarg);
|
||
|
+ if( res_alignment < 0 ) {
|
||
|
+ fprintf(stderr, "alignment for the result must not be negative\n");
|
||
|
+ exit(-1);
|
||
|
+ }
|
||
|
+ break;
|
||
|
case 'h':
|
||
|
fprintf(stdout, "%s options are:\n"
|
||
|
" -l <number> : lower number of elements\n"
|
||
|
" -u <number> : upper number of elements\n"
|
||
|
" -s <type_size> : 8, 16, 32 or 64 bits elements\n"
|
||
|
" -t [i,u,f,d] : type of the elements to apply the operations on\n"
|
||
|
+ " -r <number> : number of repetitions for each test\n"
|
||
|
" -o <op> : comma separated list of operations to execute among\n"
|
||
|
" sum, min, max, prod, bor, bxor, band\n"
|
||
|
+ " -i <number> : shift on all buffers to check alignment\n"
|
||
|
+ " -1 <number> : (mis)alignment in elements for the first op\n"
|
||
|
+ " -2 <number> : (mis)alignment in elements for the result\n"
|
||
|
" -v: increase the verbosity level\n"
|
||
|
" -h: this help message\n", argv[0]);
|
||
|
exit(0);
|
||
|
@@ -233,9 +279,10 @@ int main(int argc, char **argv)
|
||
|
if( !do_ops_built ) { /* not yet done, take the default */
|
||
|
build_do_ops( "all", do_ops);
|
||
|
}
|
||
|
- in_buf = malloc(upper * sizeof(double));
|
||
|
- inout_buf = malloc(upper * sizeof(double));
|
||
|
- inout_check_buf = malloc(upper * sizeof(double));
|
||
|
+ posix_memalign( &in_buf, 64, (upper + op1_alignment) * sizeof(double));
|
||
|
+ posix_memalign( &inout_buf, 64, (upper + res_alignment) * sizeof(double));
|
||
|
+ posix_memalign( &inout_check_buf, 64, upper * sizeof(double));
|
||
|
+ duration = (double*)calloc(max_shift, sizeof(double));
|
||
|
|
||
|
ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
|
||
|
|
||
|
@@ -253,8 +300,8 @@ int main(int argc, char **argv)
|
||
|
correctness = 1;
|
||
|
if('i' == type[type_idx]) {
|
||
|
if( 8 == type_size ) {
|
||
|
- int8_t *in_int8 = (int8_t*)in_buf,
|
||
|
- *inout_int8 = (int8_t*)inout_buf,
|
||
|
+ int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)),
|
||
|
+ *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)),
|
||
|
*inout_int8_for_check = (int8_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_int8[i] = 5;
|
||
|
@@ -299,8 +346,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 16 == type_size ) {
|
||
|
- int16_t *in_int16 = (int16_t*)in_buf,
|
||
|
- *inout_int16 = (int16_t*)inout_buf,
|
||
|
+ int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)),
|
||
|
+ *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)),
|
||
|
*inout_int16_for_check = (int16_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_int16[i] = 5;
|
||
|
@@ -345,8 +392,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 32 == type_size ) {
|
||
|
- int32_t *in_int32 = (int32_t*)in_buf,
|
||
|
- *inout_int32 = (int32_t*)inout_buf,
|
||
|
+ int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)),
|
||
|
+ *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)),
|
||
|
*inout_int32_for_check = (int32_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_int32[i] = 5;
|
||
|
@@ -391,8 +438,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 64 == type_size ) {
|
||
|
- int64_t *in_int64 = (int64_t*)in_buf,
|
||
|
- *inout_int64 = (int64_t*)inout_buf,
|
||
|
+ int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)),
|
||
|
+ *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)),
|
||
|
*inout_int64_for_check = (int64_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_int64[i] = 5;
|
||
|
@@ -440,8 +487,8 @@ int main(int argc, char **argv)
|
||
|
|
||
|
if( 'u' == type[type_idx] ) {
|
||
|
if( 8 == type_size ) {
|
||
|
- uint8_t *in_uint8 = (uint8_t*)in_buf,
|
||
|
- *inout_uint8 = (uint8_t*)inout_buf,
|
||
|
+ uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)),
|
||
|
+ *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)),
|
||
|
*inout_uint8_for_check = (uint8_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_uint8[i] = 5;
|
||
|
@@ -486,8 +533,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 16 == type_size ) {
|
||
|
- uint16_t *in_uint16 = (uint16_t*)in_buf,
|
||
|
- *inout_uint16 = (uint16_t*)inout_buf,
|
||
|
+ uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)),
|
||
|
+ *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)),
|
||
|
*inout_uint16_for_check = (uint16_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_uint16[i] = 5;
|
||
|
@@ -532,8 +579,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 32 == type_size ) {
|
||
|
- uint32_t *in_uint32 = (uint32_t*)in_buf,
|
||
|
- *inout_uint32 = (uint32_t*)inout_buf,
|
||
|
+ uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)),
|
||
|
+ *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)),
|
||
|
*inout_uint32_for_check = (uint32_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_uint32[i] = 5;
|
||
|
@@ -578,8 +625,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
}
|
||
|
if( 64 == type_size ) {
|
||
|
- uint64_t *in_uint64 = (uint64_t*)in_buf,
|
||
|
- *inout_uint64 = (uint64_t*)inout_buf,
|
||
|
+ uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)),
|
||
|
+ *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)),
|
||
|
*inout_uint64_for_check = (uint64_t*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_uint64[i] = 5;
|
||
|
@@ -626,8 +673,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
|
||
|
if( 'f' == type[type_idx] ) {
|
||
|
- float *in_float = (float*)in_buf,
|
||
|
- *inout_float = (float*)inout_buf,
|
||
|
+ float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)),
|
||
|
+ *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)),
|
||
|
*inout_float_for_check = (float*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_float[i] = 1000.0+1;
|
||
|
@@ -658,8 +705,8 @@ int main(int argc, char **argv)
|
||
|
}
|
||
|
|
||
|
if( 'd' == type[type_idx] ) {
|
||
|
- double *in_double = (double*)in_buf,
|
||
|
- *inout_double = (double*)inout_buf,
|
||
|
+ double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)),
|
||
|
+ *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)),
|
||
|
*inout_double_for_check = (double*)inout_check_buf;
|
||
|
for( i = 0; i < count; i++ ) {
|
||
|
in_double[i] = 10.0+1;
|
||
|
@@ -691,7 +738,7 @@ int main(int argc, char **argv)
|
||
|
check_and_continue:
|
||
|
if( !skip_op_type )
|
||
|
print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
|
||
|
- mpi_type, type_size, count, tend-tstart, correctness);
|
||
|
+ mpi_type, type_size, count, max_shift, duration, repeats, correctness);
|
||
|
}
|
||
|
if( !skip_op_type )
|
||
|
printf("\n");
|