--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code with a SIMD width of 4.
+ * To support a new architecture, adding macros here should be all
+ * that is needed.
+ *
+ * Note that this file is intended only for SIMD operations that require
+ * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
+ * support, more functionality and higher performance, but the SIMD width is
+ * not necessarily equal to 4.
+ */
+
+#ifdef _gmx_simd4_macros_h_
+#error "gmx_simd4_macros.h included twice"
+#else
+#define _gmx_simd4_macros_h_
+
+
+/* The SIMD width here is always 4, since that is the whole point */
+#define GMX_SIMD4_WIDTH 4
+
+
+#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
+/* Precision set before inclusion, honour that request */
+#else
+/* Match precision to the Gromacs real precision */
+#ifdef GMX_DOUBLE
+#define GMX_SIMD4_DOUBLE
+#else
+#define GMX_SIMD4_SINGLE
+#endif
+#endif
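+
+/* A user of this header can request a specific precision by defining
+ * GMX_SIMD4_SINGLE or GMX_SIMD4_DOUBLE before including it. For example,
+ * the bounding-box search code forces single precision, independent of
+ * GMX_DOUBLE:
+ *
+ *     #define GMX_SIMD4_SINGLE
+ *     #include "gmx_simd4_macros.h"
+ */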
+
+#ifdef GMX_SIMD4_DOUBLE
+typedef double gmx_simd4_real;
+#endif
+#ifdef GMX_SIMD4_SINGLE
+typedef float gmx_simd4_real;
+#endif
+
+/* Uncomment the next line, without other SIMD active, for testing plain-C */
+/* #define GMX_SIMD4_REFERENCE_PLAIN_C */
+#ifdef GMX_SIMD4_REFERENCE_PLAIN_C
+/* Plain C SIMD reference implementation, also serves as documentation */
+#define GMX_HAVE_SIMD4_MACROS
+
+/* Include the plain-C reference implementation */
+#include "gmx_simd4_ref.h"
+
+/* float/double SIMD register type */
+#define gmx_simd4_pr gmx_simd4_ref_pr
+
+/* boolean SIMD register type */
+#define gmx_simd4_pb gmx_simd4_ref_pb
+
+#define gmx_simd4_load_pr gmx_simd4_ref_load_pr
+#define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr
+#define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr
+#define gmx_simd4_store_pr gmx_simd4_ref_store_pr
+
+/* Unaligned load+store are not required,
+ * but they can speed up the PME spread+gather operations.
+ */
+#define GMX_SIMD4_HAVE_UNALIGNED
+#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#define gmx_simd4_loadu_pr gmx_simd4_ref_load_pr
+#define gmx_simd4_storeu_pr gmx_simd4_ref_store_pr
+#endif
+
+#define gmx_simd4_add_pr gmx_simd4_ref_add_pr
+#define gmx_simd4_sub_pr gmx_simd4_ref_sub_pr
+#define gmx_simd4_mul_pr gmx_simd4_ref_mul_pr
+/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
+#define gmx_simd4_madd_pr gmx_simd4_ref_madd_pr
+#define gmx_simd4_nmsub_pr gmx_simd4_ref_nmsub_pr
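+
+/* Example of the accumulation pattern meant above (sum_S, a_S and b_S are
+ * hypothetical variables): reusing the addend as the destination,
+ *
+ *     sum_S = gmx_simd4_madd_pr(a_S, b_S, sum_S);
+ *
+ * lets FMA3 hardware emit a single fused instruction without an extra
+ * register copy.
+ */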
+
+#define gmx_simd4_dotproduct3 gmx_simd4_ref_dotproduct3
+
+#define gmx_simd4_min_pr gmx_simd4_ref_min_pr
+#define gmx_simd4_max_pr gmx_simd4_ref_max_pr
+
+#define gmx_simd4_blendzero_pr gmx_simd4_ref_blendzero_pr
+
+/* Comparison */
+#define gmx_simd4_cmplt_pr gmx_simd4_ref_cmplt_pr
+
+/* Logical operations on SIMD booleans */
+#define gmx_simd4_and_pb gmx_simd4_ref_and_pb
+#define gmx_simd4_or_pb gmx_simd4_ref_or_pb
+
+/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
+#define gmx_simd4_anytrue_pb gmx_simd4_ref_anytrue_pb
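+
+/* Typical use of the comparison and boolean operations above (a sketch;
+ * rsq_S, rc2_S and wco_S are hypothetical variables holding squared
+ * distances, a squared cut-off and the comparison result):
+ *
+ *     wco_S = gmx_simd4_cmplt_pr(rsq_S, rc2_S);
+ *     if (gmx_simd4_anytrue_pb(wco_S))
+ *     {
+ *         ... at least one of the 4 distances is within the cut-off ...
+ *     }
+ */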
+
+#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */
+
+
+/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
+ * to instructions) for different SIMD widths and float precisions.
+ *
+ * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
+ * The _pr suffix is replaced by _ps or _pd (for single or double precision).
+ * Compiler settings will decide if 128-bit intrinsics will
+ * be translated into SSE or AVX instructions.
+ */
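+
+/* For example, with the definitions below gmx_simd4_load_pr becomes
+ * _mm_load_ps for single precision and _mm256_load_pd for double precision
+ * with AVX.
+ */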
+
+
+#ifdef GMX_X86_SSE2
+/* This is for general x86 SIMD instruction sets that also support SSE2 */
+
+#ifdef GMX_SIMD4_SINGLE
+#define GMX_HAVE_SIMD4_MACROS
+#endif
+
+#ifdef GMX_SIMD4_DOUBLE
+/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA.
+ * The naming is then inconsistent, but this should give the best performance.
+ */
+#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
+#define GMX_HAVE_SIMD4_MACROS
+#endif
+#endif
+
+#ifdef GMX_HAVE_SIMD4_MACROS
+
+#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
+
+#include <immintrin.h>
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
+#else
+#ifdef GMX_X86_SSE4_1
+#include <smmintrin.h>
+#else
+/* We only have SSE2 */
+#include <emmintrin.h>
+#endif
+#endif
+
+#ifdef GMX_SIMD4_SINGLE
+
+#define gmx_simd4_pr __m128
+
+#define gmx_simd4_pb __m128
+
+#define gmx_simd4_load_pr _mm_load_ps
+#define gmx_simd4_set1_pr _mm_set1_ps
+#define gmx_simd4_setzero_pr _mm_setzero_ps
+#define gmx_simd4_store_pr _mm_store_ps
+
+/* Some old AMD processors could have problems with unaligned loads+stores */
+#ifndef GMX_FAHCORE
+#define GMX_SIMD4_HAVE_UNALIGNED
+#endif
+#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#define gmx_simd4_loadu_pr _mm_loadu_ps
+#define gmx_simd4_storeu_pr _mm_storeu_ps
+#endif
+
+#define gmx_simd4_add_pr _mm_add_ps
+#define gmx_simd4_sub_pr _mm_sub_ps
+#define gmx_simd4_mul_pr _mm_mul_ps
+
+#ifdef GMX_X86_AVX_128_FMA
+#define gmx_simd4_madd_pr(a, b, c) _mm_macc_ps(a, b, c)
+#define gmx_simd4_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c)
+#else
+#define gmx_simd4_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd4_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
+#endif
+
+static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
+#ifdef GMX_X86_SSE4_1
+{
+ float dp;
+
+ /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
+ _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));
+
+ return dp;
+}
+#else
+{
+ float dp_array[7], *dp;
+
+    /* Generate a 16-byte aligned pointer into dp_array; 7 floats guarantee 4 aligned ones */
+ dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));
+
+ _mm_store_ps(dp, _mm_mul_ps(a, b));
+
+ return dp[0] + dp[1] + dp[2];
+}
+#endif
+
+#define gmx_simd4_min_pr _mm_min_ps
+#define gmx_simd4_max_pr _mm_max_ps
+
+#define gmx_simd4_blendzero_pr _mm_and_ps
+
+#define gmx_simd4_cmplt_pr _mm_cmplt_ps
+#define gmx_simd4_and_pb _mm_and_ps
+#define gmx_simd4_or_pb _mm_or_ps
+
+#define gmx_simd4_anytrue_pb _mm_movemask_ps
+
+#endif /* GMX_SIMD4_SINGLE */
+
+
+#ifdef GMX_SIMD4_DOUBLE
+
+#define gmx_simd4_pr __m256d
+
+#define gmx_simd4_pb __m256d
+
+#define gmx_simd4_load_pr _mm256_load_pd
+#define gmx_simd4_set1_pr _mm256_set1_pd
+#define gmx_simd4_setzero_pr _mm256_setzero_pd
+#define gmx_simd4_store_pr _mm256_store_pd
+
+#define GMX_SIMD4_HAVE_UNALIGNED
+#define gmx_simd4_loadu_pr _mm256_loadu_pd
+#define gmx_simd4_storeu_pr _mm256_storeu_pd
+
+#define gmx_simd4_add_pr _mm256_add_pd
+#define gmx_simd4_sub_pr _mm256_sub_pd
+#define gmx_simd4_mul_pr _mm256_mul_pd
+#ifdef GMX_X86_AVX_128_FMA
+#define gmx_simd4_madd_pr(a, b, c) _mm256_macc_pd(a, b, c)
+#define gmx_simd4_nmsub_pr(a, b, c) _mm256_nmacc_pd(a, b, c)
+#else
+#define gmx_simd4_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd4_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#endif
+#define gmx_simd4_min_pr _mm256_min_pd
+#define gmx_simd4_max_pr _mm256_max_pd
+
+#define gmx_simd4_blendzero_pr _mm256_and_pd
+
+/* Less-than (we use ordered, non-signaling, but that's not required) */
+#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+#define gmx_simd4_and_pb _mm256_and_pd
+#define gmx_simd4_or_pb _mm256_or_pd
+
+#define gmx_simd4_anytrue_pb _mm256_movemask_pd
+
+#endif /* GMX_SIMD4_DOUBLE */
+
+
+#endif /* GMX_HAVE_SIMD4_MACROS */
+
+
+#endif /* GMX_X86_SSE2 */
+
+
+#ifdef GMX_HAVE_SIMD4_MACROS
+/* Generic function to extract a SIMD4-aligned pointer from a pointer x.
+ * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
+ * to how many you want to use, to avoid indexing outside the aligned region.
+ */
+
+static gmx_inline gmx_simd4_real *
+gmx_simd4_align_real(const gmx_simd4_real *x)
+{
+ return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
+}
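+
+/* Illustrative sketch (the function name is hypothetical and not part of the
+ * GROMACS API): accumulate x[i]*y[i] over n elements, 4 at a time, using the
+ * macros above. It assumes n is a multiple of GMX_SIMD4_WIDTH and that x and
+ * y are SIMD4 aligned, e.g. obtained through gmx_simd4_align_real on padded
+ * buffers.
+ */
+static gmx_inline gmx_simd4_real
+gmx_simd4_example_dot(int n, const gmx_simd4_real *x, const gmx_simd4_real *y)
+{
+    gmx_simd4_pr   sum_S;
+    gmx_simd4_real sum_mem[2*GMX_SIMD4_WIDTH], *sum_aligned;
+    gmx_simd4_real sum;
+    int            i;
+
+    sum_S = gmx_simd4_setzero_pr();
+    for (i = 0; i < n; i += GMX_SIMD4_WIDTH)
+    {
+        /* Use the accumulator as the addend, the pattern preferred for FMA3 */
+        sum_S = gmx_simd4_madd_pr(gmx_simd4_load_pr(x+i),
+                                  gmx_simd4_load_pr(y+i), sum_S);
+    }
+    /* Store to an aligned scratch buffer and reduce in plain C */
+    sum_aligned = gmx_simd4_align_real(sum_mem);
+    gmx_simd4_store_pr(sum_aligned, sum_S);
+    sum = 0;
+    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        sum += sum_aligned[i];
+    }
+
+    return sum;
+}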
+#endif
+
+
+#endif /* _gmx_simd4_macros_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef _gmx_simd4_ref_h_
+#define _gmx_simd4_ref_h_
+
+/* This file contains a reference plain-C implementation of 4-wide SIMD.
+ * This code is only useful for testing and documentation.
+ * Either float or double precision is supported through gmx_simd4_real,
+ * which is set in gmx_simd4_macros.h
+ */
+
+
+#include <math.h>
+
+/* float/double SIMD register type */
+typedef struct {
+ gmx_simd4_real r[GMX_SIMD4_WIDTH];
+} gmx_simd4_ref_pr;
+
+/* boolean SIMD register type */
+typedef struct {
+ char r[GMX_SIMD4_WIDTH];
+} gmx_simd4_ref_pb;
+
+
+/* Load GMX_SIMD4_WIDTH reals from memory starting at r */
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_load_pr(const gmx_simd4_real *r)
+{
+ gmx_simd4_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ a.r[i] = r[i];
+ }
+
+ return a;
+}
+
+/* Set all SIMD register elements to r */
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_set1_pr(gmx_simd4_real r)
+{
+ gmx_simd4_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ a.r[i] = r;
+ }
+
+ return a;
+}
+
+/* Set all SIMD register elements to 0 */
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_setzero_pr()
+{
+ gmx_simd4_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ a.r[i] = 0.0;
+ }
+
+ return a;
+}
+
+static gmx_inline void
+gmx_simd4_ref_store_pr(gmx_simd4_real *dest, gmx_simd4_ref_pr src)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ dest[i] = src.r[i];
+ }
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_add_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] + b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_sub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] - b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_mul_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = a.r[i]*b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_madd_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
+{
+ gmx_simd4_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ d.r[i] = a.r[i]*b.r[i] + c.r[i];
+ }
+
+ return d;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_nmsub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
+{
+ gmx_simd4_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ d.r[i] = -a.r[i]*b.r[i] + c.r[i];
+ }
+
+ return d;
+}
+
+static gmx_inline gmx_simd4_real
+gmx_simd4_ref_dotproduct3(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_real dp;
+ int i;
+
+ dp = 0.0;
+ for (i = 0; i < 3; i++)
+ {
+ dp += a.r[i]*b.r[i];
+ }
+
+ return dp;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_min_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_max_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd4_ref_pr
+gmx_simd4_ref_blendzero_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pb b)
+{
+ gmx_simd4_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (b.r[i] ? a.r[i] : 0.0);
+ }
+
+ return c;
+}
+
+/* Comparison */
+static gmx_inline gmx_simd4_ref_pb
+gmx_simd4_ref_cmplt_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
+{
+ gmx_simd4_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] < b.r[i]);
+ }
+
+ return c;
+}
+
+/* Logical AND on SIMD booleans */
+static gmx_inline gmx_simd4_ref_pb
+gmx_simd4_ref_and_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
+{
+ gmx_simd4_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] && b.r[i]);
+ }
+
+ return c;
+}
+
+/* Logical OR on SIMD booleans */
+static gmx_inline gmx_simd4_ref_pb
+gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
+{
+ gmx_simd4_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] || b.r[i]);
+ }
+
+ return c;
+}
+
+/* gmx_simd4_anytrue_pb(x) returns 1 if any of the booleans in x is True */
+static gmx_inline int
+gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a)
+{
+ int anytrue;
+ int i;
+
+ anytrue = 0;
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ if (a.r[i])
+ {
+ anytrue = 1;
+ }
+ }
+
+ return anytrue;
+}
+
+#endif /* _gmx_simd4_ref_h_ */
#define gmx_and_pb gmx_simd_ref_and_pb
#define gmx_or_pb gmx_simd_ref_or_pb
-/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True.
- * If this is not present, define GMX_SIMD_IS_TRUE(real x),
- * which should return x==True, where True is True as defined in SIMD.
- */
-#define GMX_SIMD_HAVE_ANYTRUE
-#ifdef GMX_SIMD_HAVE_ANYTRUE
+/* Returns a single int (0/1) which tells if any of the booleans is True */
#define gmx_anytrue_pb gmx_simd_ref_anytrue_pb
-#else
-/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
-#define gmx_store_pb gmx_simd_ref_store_pb
-#endif
/* Conversions only used for PME table lookup */
#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32
static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); };
-#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm_movemask_ps
#define gmx_cvttpr_epi32 _mm_cvttps_epi32
#define gmx_and_pb _mm_and_pd
#define gmx_or_pb _mm_or_pd
-#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm_movemask_pd
#define gmx_cvttpr_epi32 _mm_cvttpd_epi32
#define gmx_and_pb _mm256_and_ps
#define gmx_or_pb _mm256_or_ps
-#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm256_movemask_ps
#define gmx_cvttpr_epi32 _mm256_cvttps_epi32
#define gmx_and_pb _mm256_and_pd
#define gmx_or_pb _mm256_or_pd
-#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm256_movemask_pd
#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
return c;
}
-/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True.
- * If this is not present, define GMX_SIMD_IS_TRUE(real x),
- * which should return x==True, where True is True as defined in SIMD.
- */
+/* Returns a single int (0/1) which tells if any of the booleans is True */
static gmx_inline int
gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a)
{
return anytrue;
}
-/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
-static gmx_inline void
-gmx_simd_ref_store_pb(real *dest, gmx_simd_ref_pb src)
-{
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- dest[i] = src.r[i];
- }
-};
-
/* Conversions only used for PME table lookup */
static gmx_inline gmx_simd_ref_epi32
gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
#include "gmx_simd_macros.h"
#endif
-#ifdef __cplusplus
-extern "C" {
+
+/* Bounding box calculations are (currently) always in single precision.
+ * This uses less (cache-)memory and SIMD is faster, at least on x86.
+ */
+#define GMX_SIMD4_SINGLE
+/* Include the 4-wide SIMD macro file */
+#include "gmx_simd4_macros.h"
+/* Check if we have 4-wide SIMD macro support */
+#ifdef GMX_HAVE_SIMD4_MACROS
+#define NBNXN_SEARCH_BB_SIMD4
#endif
-#ifdef GMX_X86_SSE2
-/* Use 4-way SIMD for, always, single precision bounding box calculations */
-#define NBNXN_SEARCH_BB_SSE
+#ifdef __cplusplus
+extern "C" {
#endif
#endif
+#ifdef NBNXN_SEARCH_BB_SIMD4
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
+#define NBNXN_SEARCH_BB_MEM_ALIGN (GMX_SIMD4_WIDTH*sizeof(float))
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_SEARCH_BB_MEM_ALIGN 32
+#endif
+
+
/* Pair search box lower and upper corner in x,y,z.
- * Store this in 4 iso 3 reals, which is useful with SSE.
- * To avoid complicating the code we also use 4 without SSE.
+ * Store this in 4 instead of 3 reals, which is useful with 4-wide SIMD.
+ * To avoid complicating the code we also use 4 without 4-wide SIMD.
*/
#define NNBSBB_C 4
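+/* The nbnxn_bb_t type used below is assumed to store these corners as two
+ * padded 4-float arrays, roughly:
+ *
+ *     typedef struct {
+ *         float lower[NNBSBB_C];
+ *         float upper[NNBSBB_C];
+ *     } nbnxn_bb_t;
+ *
+ * so &bb->lower[0] and &bb->upper[0] can be processed with single aligned
+ * 4-wide SIMD loads and stores, as done below.
+ */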
/* Pair search box lower and upper bound in z only. */
int *nsubc; /* The number of sub cells for each super cell */
float *bbcz; /* Bounding boxes in z for the super cells */
nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */
- nbnxn_bb_t *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */
+ nbnxn_bb_t *bbj; /* 3D j-bounding boxes for the case where *
+ * the i- and j-cluster sizes are different */
float *pbb; /* 3D b. boxes in xxxx format per super cell */
int *flags; /* Flag for the super cells */
int nc_nalloc; /* Allocation size for the pointers above */
typedef struct nbnxn_x_ci_simd_4xn {
/* The i-cluster coordinates for simple search */
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
- gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S1, iy_S1, iz_S1;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr ix_S3, iy_S3, iz_S3;
} nbnxn_x_ci_simd_4xn_t;
typedef struct nbnxn_x_ci_simd_2xnn {
/* The i-cluster coordinates for simple search */
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
} nbnxn_x_ci_simd_2xnn_t;
#endif
#include "nrnb.h"
-#ifdef NBNXN_SEARCH_BB_SSE
-/* We use SSE or AVX-128bit for bounding box calculations */
+#ifdef NBNXN_SEARCH_BB_SIMD4
+/* We use 4-wide SIMD for bounding box calculations */
#ifndef GMX_DOUBLE
-/* Single precision BBs + coordinates, we can also load coordinates using SSE */
-#define NBNXN_SEARCH_SSE_SINGLE
+/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
+#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
#endif
-/* Include basic SSE2 stuff */
-#include <emmintrin.h>
-
-#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
/* Store bounding boxes with x, y and z coordinates in packs of 4 */
-#define NBNXN_PBB_SSE
+#define NBNXN_PBB_SIMD4
#endif
-/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
- * Here AVX-256 turns out to be slightly slower than AVX-128.
+/* The packed bounding box coordinate stride is always set to 4.
+ * With AVX we could use 8, but that turns out not to be faster.
*/
#define STRIDE_PBB 4
#define STRIDE_PBB_2LOG 2
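+
+/* With this packed (xxxxyyyyzzzz) format, coordinate i of bounding box b,
+ * with i = 0..5 running over the x,y,z lower and then x,y,z upper corners,
+ * is stored at
+ *
+ *     pbb[(b >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + i*STRIDE_PBB + (b & (STRIDE_PBB-1))]
+ *
+ * so one 4-wide SIMD load fetches the same coordinate of 4 consecutive boxes.
+ */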
-#endif /* NBNXN_SEARCH_BB_SSE */
+#endif /* NBNXN_SEARCH_BB_SIMD4 */
#ifdef GMX_NBNXN_SIMD
#endif /* GMX_NBNXN_SIMD */
-#ifdef NBNXN_SEARCH_BB_SSE
+#ifdef NBNXN_SEARCH_BB_SIMD4
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
sfree_aligned(grid->bb);
    /* This snew also zeros the contents, which avoids possible
- * floating exceptions in SSE with the unused bb elements.
+ * floating exceptions in SIMD with the unused bb elements.
*/
if (grid->bSimple)
{
/* Set the "empty" bounding box to the same as the first one,
* so we don't need to treat special cases in the rest of the code.
*/
-#ifdef NBNXN_SEARCH_BB_SSE
- _mm_store_ps(&bbj[1].lower[0], _mm_load_ps(&bbj[0].lower[0]));
- _mm_store_ps(&bbj[1].upper[0], _mm_load_ps(&bbj[0].upper[0]));
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_pr(&bbj[0].lower[0]));
+ gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_pr(&bbj[0].upper[0]));
#else
bbj[1] = bbj[0];
#endif
}
-#ifdef NBNXN_SEARCH_BB_SSE
- _mm_store_ps(&bb->lower[0], _mm_min_ps(_mm_load_ps(&bbj[0].lower[0]),
- _mm_load_ps(&bbj[1].lower[0])));
- _mm_store_ps(&bb->upper[0], _mm_max_ps(_mm_load_ps(&bbj[0].upper[0]),
- _mm_load_ps(&bbj[1].upper[0])));
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ gmx_simd4_store_pr(&bb->lower[0],
+ gmx_simd4_min_pr(gmx_simd4_load_pr(&bbj[0].lower[0]),
+ gmx_simd4_load_pr(&bbj[1].lower[0])));
+ gmx_simd4_store_pr(&bb->upper[0],
+ gmx_simd4_max_pr(gmx_simd4_load_pr(&bbj[0].upper[0]),
+ gmx_simd4_load_pr(&bbj[1].upper[0])));
#else
{
int i;
#endif
}
-#ifdef NBNXN_SEARCH_BB_SSE
+#ifdef NBNXN_SEARCH_BB_SIMD4
/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
bb[5*STRIDE_PBB] = R2F_U(zh);
}
-#endif /* NBNXN_SEARCH_BB_SSE */
+#endif /* NBNXN_SEARCH_BB_SIMD4 */
-#ifdef NBNXN_SEARCH_SSE_SINGLE
+#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
/* Coordinate order xyz?, bb order xyz0 */
-static void calc_bounding_box_sse(int na, const float *x, nbnxn_bb_t *bb)
+static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
- __m128 bb_0_SSE, bb_1_SSE;
- __m128 x_SSE;
+ gmx_simd4_pr bb_0_S, bb_1_S;
+ gmx_simd4_pr x_S;
int i;
- bb_0_SSE = _mm_load_ps(x);
- bb_1_SSE = bb_0_SSE;
+ bb_0_S = gmx_simd4_load_pr(x);
+ bb_1_S = bb_0_S;
for (i = 1; i < na; i++)
{
- x_SSE = _mm_load_ps(x+i*NNBSBB_C);
- bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE);
- bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
+ x_S = gmx_simd4_load_pr(x+i*NNBSBB_C);
+ bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S);
+ bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S);
}
- _mm_store_ps(&bb->lower[0], bb_0_SSE);
- _mm_store_ps(&bb->upper[0], bb_1_SSE);
+ gmx_simd4_store_pr(&bb->lower[0], bb_0_S);
+ gmx_simd4_store_pr(&bb->upper[0], bb_1_S);
}
/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
-static void calc_bounding_box_xxxx_sse(int na, const float *x,
- nbnxn_bb_t *bb_work_aligned,
- real *bb)
+static void calc_bounding_box_xxxx_simd4(int na, const float *x,
+ nbnxn_bb_t *bb_work_aligned,
+ real *bb)
{
- calc_bounding_box_sse(na, x, bb_work_aligned);
+ calc_bounding_box_simd4(na, x, bb_work_aligned);
bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}
-#endif /* NBNXN_SEARCH_SSE_SINGLE */
+#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
nc2 = (grid->cxy_na[i]+3)>>(2+1);
for (c2 = sc2; c2 < sc2+nc2; c2++)
{
-#ifdef NBNXN_SEARCH_BB_SSE
- __m128 min_SSE, max_SSE;
-
- min_SSE = _mm_min_ps(_mm_load_ps(&bb[c2*2+0].lower[0]),
- _mm_load_ps(&bb[c2*2+1].lower[0]));
- max_SSE = _mm_max_ps(_mm_load_ps(&bb[c2*2+0].upper[0]),
- _mm_load_ps(&bb[c2*2+1].upper[0]));
- _mm_store_ps(&grid->bbj[c2].lower[0], min_SSE);
- _mm_store_ps(&grid->bbj[c2].upper[0], max_SSE);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ gmx_simd4_pr min_S, max_S;
+
+ min_S = gmx_simd4_min_pr(gmx_simd4_load_pr(&bb[c2*2+0].lower[0]),
+ gmx_simd4_load_pr(&bb[c2*2+1].lower[0]));
+ max_S = gmx_simd4_max_pr(gmx_simd4_load_pr(&bb[c2*2+0].upper[0]),
+ gmx_simd4_load_pr(&bb[c2*2+1].upper[0]));
+ gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S);
+ gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S);
#else
for (j = 0; j < NNBSBB_C; j++)
{
else if (!grid->bSimple)
{
/* Store the bounding boxes in a format convenient
- * for SSE calculations: xxxxyyyyzzzz...
+ * for SIMD4 calculations: xxxxyyyyzzzz...
*/
pbb_ptr =
grid->pbb +
((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
(((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
-#ifdef NBNXN_SEARCH_SSE_SINGLE
+#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
if (nbat->XFormat == nbatXYZQ)
{
- calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
- bb_work_aligned, pbb_ptr);
+ calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
+ bb_work_aligned, pbb_ptr);
}
else
#endif
return d2;
}
-#ifdef NBNXN_SEARCH_BB_SSE
+#ifdef NBNXN_SEARCH_BB_SIMD4
-/* SSE code for bb distance for bb format xyz0 */
-static float subc_bb_dist2_sse(int si, const nbnxn_bb_t *bb_i_ci,
- int csj, const nbnxn_bb_t *bb_j_all)
+/* 4-wide SIMD code for bb distance for bb format xyz0 */
+static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
+ int csj, const nbnxn_bb_t *bb_j_all)
{
- __m128 bb_i_SSE0, bb_i_SSE1;
- __m128 bb_j_SSE0, bb_j_SSE1;
- __m128 dl_SSE;
- __m128 dh_SSE;
- __m128 dm_SSE;
- __m128 dm0_SSE;
- __m128 d2_SSE;
-#ifndef GMX_X86_SSE4_1
- float d2_array[7], *d2_align;
-
- d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
-#else
- float d2;
-#endif
-
- bb_i_SSE0 = _mm_load_ps(&bb_i_ci[si].lower[0]);
- bb_i_SSE1 = _mm_load_ps(&bb_i_ci[si].upper[0]);
- bb_j_SSE0 = _mm_load_ps(&bb_j_all[csj].lower[0]);
- bb_j_SSE1 = _mm_load_ps(&bb_j_all[csj].upper[0]);
-
- dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
- dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
+ gmx_simd4_pr bb_i_S0, bb_i_S1;
+ gmx_simd4_pr bb_j_S0, bb_j_S1;
+ gmx_simd4_pr dl_S;
+ gmx_simd4_pr dh_S;
+ gmx_simd4_pr dm_S;
+ gmx_simd4_pr dm0_S;
- dm_SSE = _mm_max_ps(dl_SSE, dh_SSE);
- dm0_SSE = _mm_max_ps(dm_SSE, _mm_setzero_ps());
-#ifndef GMX_X86_SSE4_1
- d2_SSE = _mm_mul_ps(dm0_SSE, dm0_SSE);
+ bb_i_S0 = gmx_simd4_load_pr(&bb_i_ci[si].lower[0]);
+ bb_i_S1 = gmx_simd4_load_pr(&bb_i_ci[si].upper[0]);
+ bb_j_S0 = gmx_simd4_load_pr(&bb_j_all[csj].lower[0]);
+ bb_j_S1 = gmx_simd4_load_pr(&bb_j_all[csj].upper[0]);
- _mm_store_ps(d2_align, d2_SSE);
+ dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1);
+ dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1);
- return d2_align[0] + d2_align[1] + d2_align[2];
-#else
- /* SSE4.1 dot product of components 0,1,2 */
- d2_SSE = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
-
- _mm_store_ss(&d2, d2_SSE);
+ dm_S = gmx_simd4_max_pr(dl_S, dh_S);
+ dm0_S = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr());
- return d2;
-#endif
+ return gmx_simd4_dotproduct3(dm0_S, dm0_S);
}
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
-#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
+#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
{ \
int shi; \
\
- __m128 dx_0, dy_0, dz_0; \
- __m128 dx_1, dy_1, dz_1; \
+ gmx_simd4_pr dx_0, dy_0, dz_0; \
+ gmx_simd4_pr dx_1, dy_1, dz_1; \
\
- __m128 mx, my, mz; \
- __m128 m0x, m0y, m0z; \
+ gmx_simd4_pr mx, my, mz; \
+ gmx_simd4_pr m0x, m0y, m0z; \
\
- __m128 d2x, d2y, d2z; \
- __m128 d2s, d2t; \
+ gmx_simd4_pr d2x, d2y, d2z; \
+ gmx_simd4_pr d2s, d2t; \
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \
- yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \
- zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \
- xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \
- yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \
- zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \
+ xi_l = gmx_simd4_load_pr(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = gmx_simd4_load_pr(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = gmx_simd4_load_pr(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = gmx_simd4_load_pr(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = gmx_simd4_load_pr(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = gmx_simd4_load_pr(bb_i+shi+5*STRIDE_PBB); \
\
- dx_0 = _mm_sub_ps(xi_l, xj_h); \
- dy_0 = _mm_sub_ps(yi_l, yj_h); \
- dz_0 = _mm_sub_ps(zi_l, zj_h); \
+ dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \
+ dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \
+ dz_0 = gmx_simd4_sub_pr(zi_l, zj_h); \
\
- dx_1 = _mm_sub_ps(xj_l, xi_h); \
- dy_1 = _mm_sub_ps(yj_l, yi_h); \
- dz_1 = _mm_sub_ps(zj_l, zi_h); \
+ dx_1 = gmx_simd4_sub_pr(xj_l, xi_h); \
+ dy_1 = gmx_simd4_sub_pr(yj_l, yi_h); \
+ dz_1 = gmx_simd4_sub_pr(zj_l, zi_h); \
\
- mx = _mm_max_ps(dx_0, dx_1); \
- my = _mm_max_ps(dy_0, dy_1); \
- mz = _mm_max_ps(dz_0, dz_1); \
+ mx = gmx_simd4_max_pr(dx_0, dx_1); \
+ my = gmx_simd4_max_pr(dy_0, dy_1); \
+ mz = gmx_simd4_max_pr(dz_0, dz_1); \
\
- m0x = _mm_max_ps(mx, zero); \
- m0y = _mm_max_ps(my, zero); \
- m0z = _mm_max_ps(mz, zero); \
+ m0x = gmx_simd4_max_pr(mx, zero); \
+ m0y = gmx_simd4_max_pr(my, zero); \
+ m0z = gmx_simd4_max_pr(mz, zero); \
\
- d2x = _mm_mul_ps(m0x, m0x); \
- d2y = _mm_mul_ps(m0y, m0y); \
- d2z = _mm_mul_ps(m0z, m0z); \
+ d2x = gmx_simd4_mul_pr(m0x, m0x); \
+ d2y = gmx_simd4_mul_pr(m0y, m0y); \
+ d2z = gmx_simd4_mul_pr(m0z, m0z); \
\
- d2s = _mm_add_ps(d2x, d2y); \
- d2t = _mm_add_ps(d2s, d2z); \
+ d2s = gmx_simd4_add_pr(d2x, d2y); \
+ d2t = gmx_simd4_add_pr(d2s, d2z); \
\
- _mm_store_ps(d2+si, d2t); \
+ gmx_simd4_store_pr(d2+si, d2t); \
}
-/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
-static void subc_bb_dist2_sse_xxxx(const float *bb_j,
- int nsi, const float *bb_i,
- float *d2)
+/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
+static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
+ int nsi, const float *bb_i,
+ float *d2)
{
- __m128 xj_l, yj_l, zj_l;
- __m128 xj_h, yj_h, zj_h;
- __m128 xi_l, yi_l, zi_l;
- __m128 xi_h, yi_h, zi_h;
+ gmx_simd4_pr xj_l, yj_l, zj_l;
+ gmx_simd4_pr xj_h, yj_h, zj_h;
+ gmx_simd4_pr xi_l, yi_l, zi_l;
+ gmx_simd4_pr xi_h, yi_h, zi_h;
- __m128 zero;
+ gmx_simd4_pr zero;
- zero = _mm_setzero_ps();
+ zero = gmx_simd4_setzero_pr();
- xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
- yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
- zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
- xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
- yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
- zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
+ xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]);
+ yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]);
+ zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]);
+ xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]);
+ yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]);
+ zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]);
/* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
* But as we know the number of iterations is 1 or 2, we unroll manually.
*/
- SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
+ SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
if (STRIDE_PBB < nsi)
{
- SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
+ SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);
}
}
-#endif /* NBNXN_SEARCH_BB_SSE */
+#endif /* NBNXN_SEARCH_BB_SIMD4 */
/* Plain C function which determines if any atom pair between two cells
* is within distance sqrt(rl2).
return FALSE;
}
-#ifdef NBNXN_SEARCH_SSE_SINGLE
+#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
/* When we make separate single/double precision SIMD vector operation
* include files, this function should be moved there (also using FMA).
*/
-static inline __m128
-gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z)
+static inline gmx_simd4_pr
+gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z)
{
- return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) );
+ return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) );
}
#endif
-/* SSE function which determines if any atom pair between two cells,
+/* 4-wide SIMD function which determines if any atom pair between two cells,
* both with 8 atoms, is within distance sqrt(rl2).
- * Not performance critical, so only uses plain SSE.
+ * Using 8-wide AVX is not faster on Intel Sandy Bridge.
*/
-static gmx_bool subc_in_range_sse8(int na_c,
- int si, const real *x_i,
- int csj, int stride, const real *x_j,
- real rl2)
+static gmx_bool subc_in_range_simd4(int na_c,
+ int si, const real *x_i,
+ int csj, int stride, const real *x_j,
+ real rl2)
{
-#ifdef NBNXN_SEARCH_SSE_SINGLE
- __m128 ix_SSE0, iy_SSE0, iz_SSE0;
- __m128 ix_SSE1, iy_SSE1, iz_SSE1;
+#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
+ gmx_simd4_pr ix_S0, iy_S0, iz_S0;
+ gmx_simd4_pr ix_S1, iy_S1, iz_S1;
- __m128 rc2_SSE;
+ gmx_simd4_pr rc2_S;
- int na_c_sse;
+ int dim_stride;
int j0, j1;
- rc2_SSE = _mm_set1_ps(rl2);
+ rc2_S = gmx_simd4_set1_pr(rl2);
- na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
- ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
- iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
- iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
- ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
- iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
- iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
+ dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
+ ix_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
+ iy_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
+ iz_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
+ ix_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
+ iy_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
+ iz_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
j1 = j0 + na_c - 1;
while (j0 < j1)
{
- __m128 jx0_SSE, jy0_SSE, jz0_SSE;
- __m128 jx1_SSE, jy1_SSE, jz1_SSE;
+ gmx_simd4_pr jx0_S, jy0_S, jz0_S;
+ gmx_simd4_pr jx1_S, jy1_S, jz1_S;
- __m128 dx_SSE0, dy_SSE0, dz_SSE0;
- __m128 dx_SSE1, dy_SSE1, dz_SSE1;
- __m128 dx_SSE2, dy_SSE2, dz_SSE2;
- __m128 dx_SSE3, dy_SSE3, dz_SSE3;
+ gmx_simd4_pr dx_S0, dy_S0, dz_S0;
+ gmx_simd4_pr dx_S1, dy_S1, dz_S1;
+ gmx_simd4_pr dx_S2, dy_S2, dz_S2;
+ gmx_simd4_pr dx_S3, dy_S3, dz_S3;
- __m128 rsq_SSE0;
- __m128 rsq_SSE1;
- __m128 rsq_SSE2;
- __m128 rsq_SSE3;
+ gmx_simd4_pr rsq_S0;
+ gmx_simd4_pr rsq_S1;
+ gmx_simd4_pr rsq_S2;
+ gmx_simd4_pr rsq_S3;
- __m128 wco_SSE0;
- __m128 wco_SSE1;
- __m128 wco_SSE2;
- __m128 wco_SSE3;
- __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+ gmx_simd4_pb wco_S0;
+ gmx_simd4_pb wco_S1;
+ gmx_simd4_pb wco_S2;
+ gmx_simd4_pb wco_S3;
+ gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S;
- jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
- jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
- jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
+ jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]);
+ jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]);
+ jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]);
- jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
- jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
- jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
+ jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]);
+ jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]);
+ jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]);
/* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx0_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy0_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz0_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx0_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy0_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz0_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE0, jx1_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE0, jy1_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE0, jz1_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE1, jx1_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE1, jy1_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE1, jz1_SSE);
+ dx_S0 = gmx_simd4_sub_pr(ix_S0, jx0_S);
+ dy_S0 = gmx_simd4_sub_pr(iy_S0, jy0_S);
+ dz_S0 = gmx_simd4_sub_pr(iz_S0, jz0_S);
+ dx_S1 = gmx_simd4_sub_pr(ix_S1, jx0_S);
+ dy_S1 = gmx_simd4_sub_pr(iy_S1, jy0_S);
+ dz_S1 = gmx_simd4_sub_pr(iz_S1, jz0_S);
+ dx_S2 = gmx_simd4_sub_pr(ix_S0, jx1_S);
+ dy_S2 = gmx_simd4_sub_pr(iy_S0, jy1_S);
+ dz_S2 = gmx_simd4_sub_pr(iz_S0, jz1_S);
+ dx_S3 = gmx_simd4_sub_pr(ix_S1, jx1_S);
+ dy_S3 = gmx_simd4_sub_pr(iy_S1, jy1_S);
+ dz_S3 = gmx_simd4_sub_pr(iz_S1, jz1_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+ rsq_S0 = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S1 = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ rsq_S2 = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S3 = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
- wco_SSE0 = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
- wco_SSE1 = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
- wco_SSE2 = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
- wco_SSE3 = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
+ wco_S0 = gmx_simd4_cmplt_pr(rsq_S0, rc2_S);
+ wco_S1 = gmx_simd4_cmplt_pr(rsq_S1, rc2_S);
+ wco_S2 = gmx_simd4_cmplt_pr(rsq_S2, rc2_S);
+ wco_S3 = gmx_simd4_cmplt_pr(rsq_S3, rc2_S);
- wco_any_SSE01 = _mm_or_ps(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = _mm_or_ps(wco_SSE2, wco_SSE3);
- wco_any_SSE = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
+ wco_any_S01 = gmx_simd4_or_pb(wco_S0, wco_S1);
+ wco_any_S23 = gmx_simd4_or_pb(wco_S2, wco_S3);
+ wco_any_S = gmx_simd4_or_pb(wco_any_S01, wco_any_S23);
- if (_mm_movemask_ps(wco_any_SSE))
+ if (gmx_simd4_anytrue_pb(wco_any_S))
{
return TRUE;
}
return FALSE;
#else
- /* No SSE */
- gmx_incons("SSE function called without SSE support");
+ /* No SIMD4 */
+ gmx_incons("SIMD4 function called without 4-wide SIMD support");
return TRUE;
#endif
snew(nbl->work, 1);
if (nbl->bSimple)
{
- snew_aligned(nbl->work->bb_ci, 1, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN);
}
else
{
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN);
#else
- snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
#endif
}
- snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
nbl->work->sort = NULL;
nbl->work->sort_nalloc = 0;
#include "nbnxn_search_simd_2xnn.h"
#endif
-/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
+/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
* Checks bounding box distances and possibly atom pair distances.
*/
static void make_cluster_list_supersub(const nbnxn_search_t nbs,
}
#ifdef NBNXN_BBXXXX
- /* Determine all ci1 bb distances in one call with SSE */
- subc_bb_dist2_sse_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
+ /* Determine all ci1 bb distances in one call with SIMD4 */
+ subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
ci1, pbb_ci, d2l);
*ndistc += na_c*2;
#endif
*ndistc += na_c*na_c;
if (d2 < rbb2 ||
(d2 < rl2 &&
-#ifdef NBNXN_PBB_SSE
- subc_in_range_sse8
+#ifdef NBNXN_PBB_SIMD4
+ subc_in_range_simd4
#else
subc_in_range_x
#endif
{
/* Avoid using function pointers here, as it's slower */
if (
-#ifdef NBNXN_PBB_SSE
- !subc_in_range_sse8
+#ifdef NBNXN_PBB_SIMD4
+ !subc_in_range_simd4
#else
!subc_in_range_x
#endif
ndirect++;
}
}
-#ifdef NBNXN_SEARCH_BB_SSE
+#ifdef NBNXN_SEARCH_BB_SIMD4
else
{
while (cj_ind_first + ndirect <= cj_ind_last &&
}
}
-#ifdef NBNXN_SEARCH_BB_SSE
+#ifdef NBNXN_SEARCH_BB_SIMD4
/* Copies PBC shifted super-cell packed atom coordinates to working array */
-static void icell_set_x_supersub_sse8(int ci,
- real shx, real shy, real shz,
- int na_c,
- int stride, const real *x,
- nbnxn_list_work_t *work)
+static void icell_set_x_supersub_simd4(int ci,
+ real shx, real shy, real shz,
+ int na_c,
+ int stride, const real *x,
+ nbnxn_list_work_t *work)
{
int si, io, ia, i, j;
real *x_ci;
}
else
{
-#ifdef NBNXN_SEARCH_BB_SSE
- nbs->icell_set_x = icell_set_x_supersub_sse8;
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ nbs->icell_set_x = icell_set_x_supersub_simd4;
#else
nbs->icell_set_x = icell_set_x_supersub;
#endif
ia = X_IND_CI_SIMD_2XNN(ci);
- x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
- x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
- x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
- x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
- x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
- x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+ x_ci->ix_S0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+ x_ci->iy_S0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+ x_ci->iz_S0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+ x_ci->ix_S2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+ x_ci->iy_S2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+ x_ci->iz_S2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
}
-#ifndef GMX_SIMD_HAVE_ANYTRUE
-/* Fallback function in case gmx_anytrue_pr is not present */
-static gmx_inline gmx_bool
-gmx_anytrue_2xn_pb(gmx_mm_pb bool_S)
-{
- real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
- gmx_bool any;
- int s;
-
- bools = gmx_simd_align_real(bools_array);
-
- gmx_store_pb(bools, bool_S);
-
- any = FALSE;
- for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
- {
- if (GMX_SIMD_IS_TRUE(s))
- {
- any = TRUE;
- }
- }
-
- return any;
-}
-#endif
-
/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
 * Checks bounding box distances and possibly atom pair distances.
const nbnxn_x_ci_simd_2xnn_t *work;
const nbnxn_bb_t *bb_ci;
- gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
+ gmx_mm_pr jx_S, jy_S, jz_S;
- gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
- gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
- gmx_mm_pr rsq_SSE0;
- gmx_mm_pr rsq_SSE2;
+ gmx_mm_pr rsq_S0;
+ gmx_mm_pr rsq_S2;
- gmx_mm_pb wco_SSE0;
- gmx_mm_pb wco_SSE2;
- gmx_mm_pb wco_any_SSE;
+ gmx_mm_pb wco_S0;
+ gmx_mm_pb wco_S2;
+ gmx_mm_pb wco_any_S;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr rc2_S;
gmx_bool InRange;
float d2;
bb_ci = nbl->work->bb_ci;
- rc2_SSE = gmx_set1_pr(rl2);
+ rc2_S = gmx_set1_pr(rl2);
InRange = FALSE;
while (!InRange && cjf <= cjl)
{
-#ifdef NBNXN_SEARCH_BB_SSE
- d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
#endif
{
xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
- jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
- jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
- jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+ jx_S = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+ jy_S = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+ jz_S = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
- dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
- dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
- dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
- dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
- dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ dx_S0 = gmx_sub_pr(work->ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(work->iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(work->iz_S0, jz_S);
+ dx_S2 = gmx_sub_pr(work->ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(work->iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(work->iz_S2, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
- wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
+ wco_any_S = gmx_or_pb(wco_S0, wco_S2);
-#ifdef GMX_SIMD_HAVE_ANYTRUE
- InRange = gmx_anytrue_pb(wco_any_SSE);
-#else
- InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
-#endif
+ InRange = gmx_anytrue_pb(wco_any_S);
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
InRange = FALSE;
while (!InRange && cjl > cjf)
{
-#ifdef NBNXN_SEARCH_BB_SSE
- d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
#endif
{
xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
- jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
- jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
- jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+ jx_S = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+ jy_S = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+ jz_S = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
- dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
- dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
- dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
- dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
- dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ dx_S0 = gmx_sub_pr(work->ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(work->iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(work->iz_S0, jz_S);
+ dx_S2 = gmx_sub_pr(work->ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(work->iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(work->iz_S2, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
- wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
+ wco_any_S = gmx_or_pb(wco_S0, wco_S2);
-#ifdef GMX_SIMD_HAVE_ANYTRUE
- InRange = gmx_anytrue_pb(wco_any_SSE);
-#else
- InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
-#endif
+ InRange = gmx_anytrue_pb(wco_any_S);
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
ia = X_IND_CI_SIMD_4XN(ci);
- x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
- x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
- x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz);
- x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
- x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
- x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
- x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
- x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
- x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
- x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
- x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
- x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
+ x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
+ x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
+ x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz);
+ x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
+ x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
+ x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
+ x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
+ x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
+ x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
+ x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
+ x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
+ x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
}
-#ifndef GMX_SIMD_HAVE_ANYTRUE
-/* Fallback function in case gmx_anytrue_pr is not present */
-static gmx_inline gmx_bool
-gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
-{
- real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
- gmx_bool any;
- int s;
-
- bools = gmx_simd_align_real(bools_array);
-
- gmx_store_pb(bools, bool_S);
-
- any = FALSE;
- for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
- {
- if (GMX_SIMD_IS_TRUE(bools[s]))
- {
- any = TRUE;
- }
- }
-
- return any;
-}
-#endif
-
/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
 * Checks bounding box distances and possibly atom pair distances.
const nbnxn_x_ci_simd_4xn_t *work;
const nbnxn_bb_t *bb_ci;
- gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
+ gmx_mm_pr jx_S, jy_S, jz_S;
- gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
- gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1;
- gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
- gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S1, dy_S1, dz_S1;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
+ gmx_mm_pr dx_S3, dy_S3, dz_S3;
- gmx_mm_pr rsq_SSE0;
- gmx_mm_pr rsq_SSE1;
- gmx_mm_pr rsq_SSE2;
- gmx_mm_pr rsq_SSE3;
+ gmx_mm_pr rsq_S0;
+ gmx_mm_pr rsq_S1;
+ gmx_mm_pr rsq_S2;
+ gmx_mm_pr rsq_S3;
- gmx_mm_pb wco_SSE0;
- gmx_mm_pb wco_SSE1;
- gmx_mm_pb wco_SSE2;
- gmx_mm_pb wco_SSE3;
- gmx_mm_pb wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+ gmx_mm_pb wco_S0;
+ gmx_mm_pb wco_S1;
+ gmx_mm_pb wco_S2;
+ gmx_mm_pb wco_S3;
+ gmx_mm_pb wco_any_S01, wco_any_S23, wco_any_S;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr rc2_S;
gmx_bool InRange;
float d2;
bb_ci = nbl->work->bb_ci;
- rc2_SSE = gmx_set1_pr(rl2);
+ rc2_S = gmx_set1_pr(rl2);
InRange = FALSE;
while (!InRange && cjf <= cjl)
{
-#ifdef NBNXN_SEARCH_BB_SSE
- d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
#endif
{
xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
- jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
- jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
- jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
+ jx_S = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
+ jy_S = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
+ jz_S = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
- dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
- dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
- dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
- dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
- dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
- dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
- dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
- dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
- dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
- dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
- dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+ dx_S0 = gmx_sub_pr(work->ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(work->iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(work->iz_S0, jz_S);
+ dx_S1 = gmx_sub_pr(work->ix_S1, jx_S);
+ dy_S1 = gmx_sub_pr(work->iy_S1, jy_S);
+ dz_S1 = gmx_sub_pr(work->iz_S1, jz_S);
+ dx_S2 = gmx_sub_pr(work->ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(work->iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(work->iz_S2, jz_S);
+ dx_S3 = gmx_sub_pr(work->ix_S3, jx_S);
+ dy_S3 = gmx_sub_pr(work->iy_S3, jy_S);
+ dz_S3 = gmx_sub_pr(work->iz_S3, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
-
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
- wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
-
- wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-
-#ifdef GMX_SIMD_HAVE_ANYTRUE
- InRange = gmx_anytrue_pb(wco_any_SSE);
-#else
- InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
-#endif
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+ wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S);
+
+ wco_any_S01 = gmx_or_pb(wco_S0, wco_S1);
+ wco_any_S23 = gmx_or_pb(wco_S2, wco_S3);
+ wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23);
+
+ InRange = gmx_anytrue_pb(wco_any_S);
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
InRange = FALSE;
while (!InRange && cjl > cjf)
{
-#ifdef NBNXN_SEARCH_BB_SSE
- d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+ d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
#endif
{
xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
- jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
- jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
- jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
+ jx_S = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
+ jy_S = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
+ jz_S = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
- dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
- dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
- dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
- dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
- dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
- dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
- dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
- dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
- dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
- dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
- dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+ dx_S0 = gmx_sub_pr(work->ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(work->iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(work->iz_S0, jz_S);
+ dx_S1 = gmx_sub_pr(work->ix_S1, jx_S);
+ dy_S1 = gmx_sub_pr(work->iy_S1, jy_S);
+ dz_S1 = gmx_sub_pr(work->iz_S1, jz_S);
+ dx_S2 = gmx_sub_pr(work->ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(work->iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(work->iz_S2, jz_S);
+ dx_S3 = gmx_sub_pr(work->ix_S3, jx_S);
+ dy_S3 = gmx_sub_pr(work->iy_S3, jy_S);
+ dz_S3 = gmx_sub_pr(work->iz_S3, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
-
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
- wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
-
- wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-
-#ifdef GMX_SIMD_HAVE_ANYTRUE
- InRange = gmx_anytrue_pb(wco_any_SSE);
-#else
- InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
-#endif
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+ wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S);
+
+ wco_any_S01 = gmx_or_pb(wco_S0, wco_S1);
+ wco_any_S23 = gmx_or_pb(wco_S2, wco_S3);
+ wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23);
+
+ InRange = gmx_anytrue_pb(wco_any_S);
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
/* Include the SIMD macro file and then check for support */
#include "gmx_simd_macros.h"
#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
-/* Turn on SIMD intrinsics for PME solve */
+/* Turn on arbitrary-width SIMD intrinsics for PME solve */
#define PME_SIMD
#endif
-/* SIMD spread+gather only in single precision with SSE2 or higher available.
- * We might want to switch to use gmx_simd_macros.h, but this is somewhat
- * complicated, as we use unaligned and/or 4-wide only loads.
+/* Include the 4-wide SIMD macro file */
+#include "gmx_simd4_macros.h"
+/* Check if we have 4-wide SIMD macro support */
+#ifdef GMX_HAVE_SIMD4_MACROS
+/* Do PME spread and gather with 4-wide SIMD.
+ * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
*/
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-#define PME_SSE_SPREAD_GATHER
-#include <emmintrin.h>
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define PME_SSE_UNALIGNED
+#define PME_SIMD4_SPREAD_GATHER
+
+#ifdef GMX_SIMD4_HAVE_UNALIGNED
+/* With PME-order=4 on x86, unaligned load+store is slightly faster
+ * than doubling all SIMD operations when using aligned load+store.
+ */
+#define PME_SIMD4_UNALIGNED
#endif
#endif
+
#include "mpelogging.h"
#define DFT_TOL 1e-7
#define mpi_type MPI_FLOAT
#endif
-/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
+#else
+/* We can use any alignment, apart from 0, so we use 4 reals */
+#define SIMD4_ALIGNMENT (4*sizeof(real))
+#endif
+
+/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
+ * to preserve alignment.
+ */
#define GMX_CACHE_SEP 64
/* We only define a maximum to be able to use local arrays without allocation.
typedef struct {
-#ifdef PME_SSE_SPREAD_GATHER
- /* Masks for SSE aligned spreading and gathering */
- __m128 mask_SSE0[6], mask_SSE1[6];
+#ifdef PME_SIMD4_SPREAD_GATHER
+ /* Masks for 4-wide SIMD aligned spreading and gathering */
+ gmx_simd4_pb mask_S0[6], mask_S1[6];
#else
int dummy; /* C89 requires that struct has at least one member */
#endif
srenew(th[XX], nalloc);
srenew(th[YY], nalloc);
- /* In z we add padding, this is only required for the aligned SSE code */
- srenew(*ptr_z, nalloc+2*padding);
+    /* In z we add padding; this is only required for the aligned SIMD code */
+ sfree_aligned(*ptr_z);
+ snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
th[ZZ] = *ptr_z + padding;
for (i = 0; i < padding; i++)
int pnx, pny, pnz, ndatatot;
int offx, offy, offz;
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+ real thz_buffer[12], *thz_aligned;
+
+ thz_aligned = gmx_simd4_align_real(thz_buffer);
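+    /* The 12-real buffer leaves room to round the pointer up to 4-real
+     * alignment and still hold the 8 entries (2 SIMD4 registers) used for
+     * the aligned, masked copy of the z-spline.
+     */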
+#endif
+
pnx = pmegrid->s[XX];
pny = pmegrid->s[YY];
pnz = pmegrid->s[ZZ];
switch (order)
{
case 4:
-#ifdef PME_SSE_SPREAD_GATHER
-#ifdef PME_SSE_UNALIGNED
-#define PME_SPREAD_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_SPREAD_SIMD4_ORDER4
#else
-#define PME_SPREAD_SSE_ALIGNED
+#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_sse_single.h"
+#include "pme_simd4.h"
#else
DO_BSPLINE(4);
#endif
break;
case 5:
-#ifdef PME_SSE_SPREAD_GATHER
-#define PME_SPREAD_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "pme_simd4.h"
#else
DO_BSPLINE(5);
#endif
static void set_grid_alignment(int *pmegrid_nz, int pme_order)
{
-#ifdef PME_SSE_SPREAD_GATHER
+#ifdef PME_SIMD4_SPREAD_GATHER
if (pme_order == 5
-#ifndef PME_SSE_UNALIGNED
+#ifndef PME_SIMD4_UNALIGNED
|| pme_order == 4
#endif
)
static void set_gridsize_alignment(int *gridsize, int pme_order)
{
-#ifdef PME_SSE_SPREAD_GATHER
-#ifndef PME_SSE_UNALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifndef PME_SIMD4_UNALIGNED
if (pme_order == 4)
{
        /* Add extra elements to ensure aligned operations do not go
{
gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
set_gridsize_alignment(&gridsize, pme_order);
- snew_aligned(grid->grid, gridsize, 16);
+ snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
}
else
{
set_gridsize_alignment(&gridsize, pme_order);
snew_aligned(grids->grid_all,
grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
- 16);
+ SIMD4_ALIGNMENT);
for (x = 0; x < grids->nc[XX]; x++)
{
static void realloc_work(pme_work_t *work, int nkx)
{
+ int simd_width;
+
if (nkx > work->nalloc)
{
work->nalloc = nkx;
* elements at the end for padding.
*/
#ifdef PME_SIMD
-#define ALIGN_HERE GMX_SIMD_WIDTH_HERE
+ simd_width = GMX_SIMD_WIDTH_HERE;
#else
-/* We can use any alignment, apart from 0, so we use 4 */
-#define ALIGN_HERE 4
+ /* We can use any alignment, apart from 0, so we use 4 */
+ simd_width = 4;
#endif
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
sfree_aligned(work->eterm);
- snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
- snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
- snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
srenew(work->m2inv, work->nalloc);
}
}
gmx_mm_pr lu;
gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_simd = gmx_load1_pr(&f);
+ f_simd = gmx_set1_pr(f);
for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
{
tmp_d1 = gmx_load_pr(d_aligned+kx);
pme_spline_work_t *work;
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+ real thz_buffer[12], *thz_aligned;
+ real dthz_buffer[12], *dthz_aligned;
+
+ thz_aligned = gmx_simd4_align_real(thz_buffer);
+ dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+#endif
+
work = pme->spline_work;
order = pme->pme_order;
switch (order)
{
case 4:
-#ifdef PME_SSE_SPREAD_GATHER
-#ifdef PME_SSE_UNALIGNED
-#define PME_GATHER_F_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_GATHER_F_SIMD4_ORDER4
#else
-#define PME_GATHER_F_SSE_ALIGNED
+#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_sse_single.h"
+#include "pme_simd4.h"
#else
DO_FSPLINE(4);
#endif
break;
case 5:
-#ifdef PME_SSE_SPREAD_GATHER
-#define PME_GATHER_F_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "pme_simd4.h"
#else
DO_FSPLINE(5);
#endif
{
pme_spline_work_t *work;
-#ifdef PME_SSE_SPREAD_GATHER
- float tmp[8];
- __m128 zero_SSE;
- int of, i;
+#ifdef PME_SIMD4_SPREAD_GATHER
+ real tmp[12], *tmp_aligned;
+ gmx_simd4_pr zero_S;
+ gmx_simd4_pr real_mask_S0, real_mask_S1;
+ int of, i;
+
-    snew_aligned(work, 1, 16);
+    snew_aligned(work, 1, SIMD4_ALIGNMENT);
+    tmp_aligned = gmx_simd4_align_real(tmp);
- zero_SSE = _mm_setzero_ps();
+ zero_S = gmx_simd4_setzero_pr();
/* Generate bit masks to mask out the unused grid entries,
* as we only operate on order of the 8 grid entries that are
- * load into 2 SSE float registers.
+             * loaded into 2 SIMD4 registers.
*/
for (of = 0; of < 8-(order-1); of++)
{
for (i = 0; i < 8; i++)
{
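+            /* Store -1.0 inside [of, of+order) and 1.0 outside; comparing
+             * against zero below turns this into an all-bits/no-bits SIMD4
+             * boolean mask for each element.
+             */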
- tmp[i] = (i >= of && i < of+order ? 1 : 0);
+ tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
}
- work->mask_SSE0[of] = _mm_loadu_ps(tmp);
- work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
- work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
- work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
+ real_mask_S0 = gmx_simd4_load_pr(tmp_aligned);
+ real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4);
+ work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S);
+ work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S);
}
#else
work = NULL;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This include file has code between ifdef's to make sure
+ * that this performance sensitive code is inlined
+ * and to remove conditionals and variable loop bounds at compile time.
+ */
+
+#ifdef PME_SPREAD_SIMD4_ORDER4
+/* Spread one charge with pme_order=4 using unaligned SIMD4 load+store.
+ * This code does not assume any memory alignment for the grid.
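+ * Each of the 4x4x4 stencil points gets qn*thx*thy*thz added to the grid;
+ * the inner z dimension is handled with one 4-wide SIMD multiply-add per
+ * (x,y) grid line.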
+ */
+{
+ gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3;
+ gmx_simd4_pr tz_S;
+ gmx_simd4_pr vx_S;
+ gmx_simd4_pr vx_tz_S;
+ gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3;
+ gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3;
+
+ ty_S0 = gmx_simd4_set1_pr(thy[0]);
+ ty_S1 = gmx_simd4_set1_pr(thy[1]);
+ ty_S2 = gmx_simd4_set1_pr(thy[2]);
+ ty_S3 = gmx_simd4_set1_pr(thy[3]);
+
+ /* With order 4 the z-spline is actually aligned */
+ tz_S = gmx_simd4_load_pr(thz);
+
+ for (ithx = 0; (ithx < 4); ithx++)
+ {
+ index_x = (i0+ithx)*pny*pnz;
+ valx = qn*thx[ithx];
+
+ vx_S = gmx_simd4_set1_pr(valx);
+
+ vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S);
+
+ gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0);
+ gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0);
+ gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0);
+ gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0);
+
+ sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0);
+ sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1);
+ sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2);
+ sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3);
+
+ gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0);
+ gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1);
+ gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2);
+ gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3);
+ }
+}
+#undef PME_SPREAD_SIMD4_ORDER4
+#endif
+
+
+#ifdef PME_GATHER_F_SIMD4_ORDER4
+/* Gather for one charge with pme_order=4 using unaligned SIMD4 load+store.
+ * This code does not assume any memory alignment for the grid.
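+ * The force components accumulate dthx*thy*thz, thx*dthy*thz and
+ * thx*thy*dthz times the grid values, with the z direction handled 4 wide
+ * and the SIMD4 partial sums reduced to scalars at the end.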
+ */
+{
+ real fx_tmp[4], fy_tmp[4], fz_tmp[4];
+
+ gmx_simd4_pr fx_S, fy_S, fz_S;
+
+ gmx_simd4_pr tx_S, ty_S, tz_S;
+ gmx_simd4_pr dx_S, dy_S, dz_S;
+
+ gmx_simd4_pr gval_S;
+
+ gmx_simd4_pr fxy1_S;
+ gmx_simd4_pr fz1_S;
+
+ fx_S = gmx_simd4_setzero_pr();
+ fy_S = gmx_simd4_setzero_pr();
+ fz_S = gmx_simd4_setzero_pr();
+
+ /* With order 4 the z-spline is actually aligned */
+ tz_S = gmx_simd4_load_pr(thz);
+ dz_S = gmx_simd4_load_pr(dthz);
+
+ for (ithx = 0; (ithx < 4); ithx++)
+ {
+ index_x = (i0+ithx)*pny*pnz;
+ tx_S = gmx_simd4_set1_pr(thx[ithx]);
+ dx_S = gmx_simd4_set1_pr(dthx[ithx]);
+
+ for (ithy = 0; (ithy < 4); ithy++)
+ {
+ index_xy = index_x+(j0+ithy)*pnz;
+ ty_S = gmx_simd4_set1_pr(thy[ithy]);
+ dy_S = gmx_simd4_set1_pr(dthy[ithy]);
+
+ gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);
+
+ fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S);
+ fz1_S = gmx_simd4_mul_pr(dz_S, gval_S);
+
+ fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
+ fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
+ fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
+ }
+ }
+
+ gmx_simd4_storeu_pr(fx_tmp, fx_S);
+ gmx_simd4_storeu_pr(fy_tmp, fy_S);
+ gmx_simd4_storeu_pr(fz_tmp, fz_S);
+
+ fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
+ fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
+ fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+}
+#undef PME_GATHER_F_SIMD4_ORDER4
+#endif
+
+
+#ifdef PME_SPREAD_SIMD4_ALIGNED
+/* This code assumes that the grid is allocated 4-real aligned
+ * and that pnz is a multiple of 4.
+ * This code supports pme_order <= 5.
+ */
+{
+ int offset;
+ int index;
+ gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
+ gmx_simd4_pr tz_S0;
+ gmx_simd4_pr tz_S1;
+ gmx_simd4_pr vx_S;
+ gmx_simd4_pr vx_tz_S0;
+ gmx_simd4_pr vx_tz_S1;
+ gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
+ gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
+ gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
+ gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;
+
+ offset = k0 & 3;
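+    /* k0 is in general not a multiple of 4; offset moves the start of the
+     * z line down to the previous 4-real aligned grid index, so the
+     * PME_ORDER grid entries along z are covered by 2 aligned SIMD4 loads.
+     */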
+
+ ty_S0 = gmx_simd4_set1_pr(thy[0]);
+ ty_S1 = gmx_simd4_set1_pr(thy[1]);
+ ty_S2 = gmx_simd4_set1_pr(thy[2]);
+ ty_S3 = gmx_simd4_set1_pr(thy[3]);
+#if PME_ORDER == 5
+ ty_S4 = gmx_simd4_set1_pr(thy[4]);
+#endif
+
+#ifdef GMX_SIMD4_HAVE_UNALIGNED
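+    /* This reads a few reals outside thz[0..PME_ORDER-1]; the padding added
+     * when the z splines are allocated keeps these accesses in bounds.
+     */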
+ tz_S0 = gmx_simd4_loadu_pr(thz-offset);
+ tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
+#else
+ {
+ int i;
+ /* Copy thz to an aligned buffer (unused buffer parts are masked) */
+ for (i = 0; i < PME_ORDER; i++)
+ {
+ thz_aligned[offset+i] = thz[i];
+ }
+ tz_S0 = gmx_simd4_load_pr(thz_aligned);
+ tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
+ }
+#endif
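+    /* Zero the elements outside [offset, offset+PME_ORDER) so that the
+     * multiply-adds below leave the corresponding grid entries unchanged.
+     */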
+ tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
+ tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
+
+ for (ithx = 0; (ithx < PME_ORDER); ithx++)
+ {
+ index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
+ valx = qn*thx[ithx];
+
+ vx_S = gmx_simd4_set1_pr(valx);
+
+ vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0);
+ vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1);
+
+ gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz);
+ gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz);
+ gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz);
+ gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz);
+#if PME_ORDER == 5
+ gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz);
+#endif
+ gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4);
+ gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4);
+ gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4);
+ gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4);
+#if PME_ORDER == 5
+ gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4);
+#endif
+
+ sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00);
+ sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01);
+ sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02);
+ sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03);
+#if PME_ORDER == 5
+ sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04);
+#endif
+ sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10);
+ sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11);
+ sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12);
+ sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13);
+#if PME_ORDER == 5
+ sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14);
+#endif
+
+ gmx_simd4_store_pr(grid+index+0*pnz, sum_S00);
+ gmx_simd4_store_pr(grid+index+1*pnz, sum_S01);
+ gmx_simd4_store_pr(grid+index+2*pnz, sum_S02);
+ gmx_simd4_store_pr(grid+index+3*pnz, sum_S03);
+#if PME_ORDER == 5
+ gmx_simd4_store_pr(grid+index+4*pnz, sum_S04);
+#endif
+ gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10);
+ gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11);
+ gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12);
+ gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13);
+#if PME_ORDER == 5
+ gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14);
+#endif
+ }
+}
+#undef PME_ORDER
+#undef PME_SPREAD_SIMD4_ALIGNED
+#endif
+
+
+#ifdef PME_GATHER_F_SIMD4_ALIGNED
+/* This code assumes that the grid is allocated 4-real aligned
+ * and that pnz is a multiple of 4.
+ * This code supports pme_order <= 5.
+ */
+{
+ int offset;
+
+ real fx_tmp[4], fy_tmp[4], fz_tmp[4];
+
+ gmx_simd4_pr fx_S, fy_S, fz_S;
+
+ gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1;
+ gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1;
+
+ gmx_simd4_pr gval_S0;
+ gmx_simd4_pr gval_S1;
+
+ gmx_simd4_pr fxy1_S0;
+ gmx_simd4_pr fz1_S0;
+ gmx_simd4_pr fxy1_S1;
+ gmx_simd4_pr fz1_S1;
+ gmx_simd4_pr fxy1_S;
+ gmx_simd4_pr fz1_S;
+
+ offset = k0 & 3;
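+    /* Same 4-alignment trick as in the aligned spread above: grid rows are
+     * read from the previous 4-aligned z index and the z-spline and its
+     * derivative are masked down to the PME_ORDER entries actually used.
+     */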
+
+ fx_S = gmx_simd4_setzero_pr();
+ fy_S = gmx_simd4_setzero_pr();
+ fz_S = gmx_simd4_setzero_pr();
+
+#ifdef GMX_SIMD4_HAVE_UNALIGNED
+ tz_S0 = gmx_simd4_loadu_pr(thz-offset);
+ tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
+ dz_S0 = gmx_simd4_loadu_pr(dthz-offset);
+ dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4);
+#else
+ {
+ int i;
+ /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */
+ for (i = 0; i < PME_ORDER; i++)
+ {
+ thz_aligned[offset+i] = thz[i];
+ dthz_aligned[offset+i] = dthz[i];
+ }
+ tz_S0 = gmx_simd4_load_pr(thz_aligned);
+ tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
+ dz_S0 = gmx_simd4_load_pr(dthz_aligned);
+ dz_S1 = gmx_simd4_load_pr(dthz_aligned+4);
+ }
+#endif
+ tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
+ dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]);
+ tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
+ dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]);
+
+ for (ithx = 0; (ithx < PME_ORDER); ithx++)
+ {
+ index_x = (i0+ithx)*pny*pnz;
+ tx_S = gmx_simd4_set1_pr(thx[ithx]);
+ dx_S = gmx_simd4_set1_pr(dthx[ithx]);
+
+ for (ithy = 0; (ithy < PME_ORDER); ithy++)
+ {
+ index_xy = index_x+(j0+ithy)*pnz;
+ ty_S = gmx_simd4_set1_pr(thy[ithy]);
+ dy_S = gmx_simd4_set1_pr(dthy[ithy]);
+
+ gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
+ gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);
+
+ fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0);
+ fz1_S0 = gmx_simd4_mul_pr(dz_S0, gval_S0);
+ fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1);
+ fz1_S1 = gmx_simd4_mul_pr(dz_S1, gval_S1);
+
+ fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1);
+ fz1_S = gmx_simd4_add_pr(fz1_S0, fz1_S1);
+
+ fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
+ fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
+ fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
+ }
+ }
+
+ gmx_simd4_store_pr(fx_tmp, fx_S);
+ gmx_simd4_store_pr(fy_tmp, fy_S);
+ gmx_simd4_store_pr(fz_tmp, fz_S);
+
+ fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
+ fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
+ fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+}
+#undef PME_ORDER
+#undef PME_GATHER_F_SIMD4_ALIGNED
+#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* This include file has code between ifdef's to make sure
- * that this performance sensitive code is inlined
- * and to remove conditionals and variable loop bounds at compile time.
- */
-
-#ifdef PME_SPREAD_SSE_ORDER4
-/* This code does not assume any memory alignment.
- * This code only works for pme_order = 4.
- */
-{
- __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3;
- __m128 tz_SSE;
- __m128 vx_SSE;
- __m128 vx_tz_SSE;
- __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3;
- __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3;
-
- ty_SSE0 = _mm_load1_ps(&thy[0]);
- ty_SSE1 = _mm_load1_ps(&thy[1]);
- ty_SSE2 = _mm_load1_ps(&thy[2]);
- ty_SSE3 = _mm_load1_ps(&thy[3]);
-
- tz_SSE = _mm_loadu_ps(thz);
-
- for (ithx = 0; (ithx < 4); ithx++)
- {
- index_x = (i0+ithx)*pny*pnz;
- valx = qn*thx[ithx];
-
- vx_SSE = _mm_load1_ps(&valx);
-
- vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE);
-
- gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
- gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
- gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
- gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);
-
- sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0));
- sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1));
- sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2));
- sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3));
-
- _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0);
- _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1);
- _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2);
- _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3);
- }
-}
-#undef PME_SPREAD_SSE_ORDER4
-#endif
-
-
-#ifdef PME_GATHER_F_SSE_ORDER4
-/* This code does not assume any memory alignment.
- * This code only works for pme_order = 4.
- */
-{
- float fx_tmp[4], fy_tmp[4], fz_tmp[4];
-
- __m128 fx_SSE, fy_SSE, fz_SSE;
-
- __m128 tx_SSE, ty_SSE, tz_SSE;
- __m128 dx_SSE, dy_SSE, dz_SSE;
-
- __m128 gval_SSE;
-
- __m128 fxy1_SSE;
- __m128 fz1_SSE;
-
- fx_SSE = _mm_setzero_ps();
- fy_SSE = _mm_setzero_ps();
- fz_SSE = _mm_setzero_ps();
-
- tz_SSE = _mm_loadu_ps(thz);
- dz_SSE = _mm_loadu_ps(dthz);
-
- for (ithx = 0; (ithx < 4); ithx++)
- {
- index_x = (i0+ithx)*pny*pnz;
- tx_SSE = _mm_load1_ps(thx+ithx);
- dx_SSE = _mm_load1_ps(dthx+ithx);
-
- for (ithy = 0; (ithy < 4); ithy++)
- {
- index_xy = index_x+(j0+ithy)*pnz;
- ty_SSE = _mm_load1_ps(thy+ithy);
- dy_SSE = _mm_load1_ps(dthy+ithy);
-
- gval_SSE = _mm_loadu_ps(grid+index_xy+k0);
-
- fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE);
- fz1_SSE = _mm_mul_ps(dz_SSE, gval_SSE);
-
- fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
- fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
- fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
- }
- }
-
- _mm_storeu_ps(fx_tmp, fx_SSE);
- _mm_storeu_ps(fy_tmp, fy_SSE);
- _mm_storeu_ps(fz_tmp, fz_SSE);
-
- fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
- fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
- fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
-}
-#undef PME_GATHER_F_SSE_ORDER4
-#endif
-
-
-#ifdef PME_SPREAD_SSE_ALIGNED
-/* This code assumes that the grid is allocated 16 bit aligned
- * and that pnz is a multiple of 4.
- * This code supports pme_order <= 5.
- */
-{
- int offset;
- int index;
- __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4;
- __m128 tz_SSE0;
- __m128 tz_SSE1;
- __m128 vx_SSE;
- __m128 vx_tz_SSE0;
- __m128 vx_tz_SSE1;
- __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04;
- __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14;
- __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04;
- __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14;
-
- offset = k0 & 3;
-
- ty_SSE0 = _mm_load1_ps(&thy[0]);
- ty_SSE1 = _mm_load1_ps(&thy[1]);
- ty_SSE2 = _mm_load1_ps(&thy[2]);
- ty_SSE3 = _mm_load1_ps(&thy[3]);
-#if PME_ORDER == 5
- ty_SSE4 = _mm_load1_ps(&thy[4]);
-#endif
-
- tz_SSE0 = _mm_loadu_ps(thz-offset);
- tz_SSE1 = _mm_loadu_ps(thz-offset+4);
- tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
- tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
-
- for (ithx = 0; (ithx < PME_ORDER); ithx++)
- {
- index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
- valx = qn*thx[ithx];
-
- vx_SSE = _mm_load1_ps(&valx);
-
- vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0);
- vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1);
-
- gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
- gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
- gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
- gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
-#if PME_ORDER == 5
- gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
-#endif
- gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
- gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
- gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
- gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
-#if PME_ORDER == 5
- gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
-#endif
-
- sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0));
- sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1));
- sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2));
- sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3));
-#if PME_ORDER == 5
- sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4));
-#endif
- sum_SSE10 = _mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0));
- sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1));
- sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2));
- sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3));
-#if PME_ORDER == 5
- sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4));
-#endif
-
- _mm_store_ps(grid+index+0*pnz, sum_SSE00);
- _mm_store_ps(grid+index+1*pnz, sum_SSE01);
- _mm_store_ps(grid+index+2*pnz, sum_SSE02);
- _mm_store_ps(grid+index+3*pnz, sum_SSE03);
-#if PME_ORDER == 5
- _mm_store_ps(grid+index+4*pnz, sum_SSE04);
-#endif
- _mm_store_ps(grid+index+0*pnz+4, sum_SSE10);
- _mm_store_ps(grid+index+1*pnz+4, sum_SSE11);
- _mm_store_ps(grid+index+2*pnz+4, sum_SSE12);
- _mm_store_ps(grid+index+3*pnz+4, sum_SSE13);
-#if PME_ORDER == 5
- _mm_store_ps(grid+index+4*pnz+4, sum_SSE14);
-#endif
- }
-}
-#undef PME_ORDER
-#undef PME_SPREAD_SSE_ALIGNED
-#endif
-
-
-#ifdef PME_GATHER_F_SSE_ALIGNED
-/* This code assumes that the grid is allocated 16 bit aligned
- * and that pnz is a multiple of 4.
- * This code supports pme_order <= 5.
- */
-{
- int offset;
-
- float fx_tmp[4], fy_tmp[4], fz_tmp[4];
-
- __m128 fx_SSE, fy_SSE, fz_SSE;
-
- __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1;
- __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1;
-
- __m128 gval_SSE0;
- __m128 gval_SSE1;
-
- __m128 fxy1_SSE0;
- __m128 fz1_SSE0;
- __m128 fxy1_SSE1;
- __m128 fz1_SSE1;
- __m128 fxy1_SSE;
- __m128 fz1_SSE;
-
- offset = k0 & 3;
-
- fx_SSE = _mm_setzero_ps();
- fy_SSE = _mm_setzero_ps();
- fz_SSE = _mm_setzero_ps();
-
- tz_SSE0 = _mm_loadu_ps(thz-offset);
- dz_SSE0 = _mm_loadu_ps(dthz-offset);
- tz_SSE1 = _mm_loadu_ps(thz-offset+4);
- dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
- tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
- dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]);
- tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
- dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]);
-
- for (ithx = 0; (ithx < PME_ORDER); ithx++)
- {
- index_x = (i0+ithx)*pny*pnz;
- tx_SSE = _mm_load1_ps(thx+ithx);
- dx_SSE = _mm_load1_ps(dthx+ithx);
-
- for (ithy = 0; (ithy < PME_ORDER); ithy++)
- {
- index_xy = index_x+(j0+ithy)*pnz;
- ty_SSE = _mm_load1_ps(thy+ithy);
- dy_SSE = _mm_load1_ps(dthy+ithy);
-
- gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
- gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);
-
- fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0);
- fz1_SSE0 = _mm_mul_ps(dz_SSE0, gval_SSE0);
- fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1);
- fz1_SSE1 = _mm_mul_ps(dz_SSE1, gval_SSE1);
-
- fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1);
- fz1_SSE = _mm_add_ps(fz1_SSE0, fz1_SSE1);
-
- fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
- fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
- fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
- }
- }
-
- _mm_store_ps(fx_tmp, fx_SSE);
- _mm_store_ps(fy_tmp, fy_SSE);
- _mm_store_ps(fz_tmp, fz_SSE);
-
- fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
- fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
- fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
-}
-#undef PME_ORDER
-#undef PME_GATHER_F_SSE_ALIGNED
-#endif