From: Berk Hess
Date: Thu, 1 Aug 2013 12:34:22 +0000 (+0200)
Subject: introduced general 4-wide SIMD support
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=b608083a6f73474bbc02b486c3792947128d8cff;p=alexxy%2Fgromacs.git

introduced general 4-wide SIMD support

PME spread+gather and the nbnxn search bounding-box checks use 4-wide
SIMD (as opposed to arbitrary-width SIMD). The SSE intrinsics code for
these parts has now been replaced by macros from gmx_simd4_macros.h.
pme_sse_single.h has been renamed to pme_simd4.h.
This change is mainly refactoring; it only adds PME spread+gather AVX
acceleration in double precision plus a few FMA instructions.

Change-Id: Ia5e02295bb281a2e23d57f4c165f555de6744064
---

diff --git a/include/gmx_simd4_macros.h b/include/gmx_simd4_macros.h
new file mode 100644
index 0000000000..7ee1581807
--- /dev/null
+++ b/include/gmx_simd4_macros.h
@@ -0,0 +1,309 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code with a SIMD width of 4.
+ * To support a new architecture, adding macros here should be all
+ * that is needed.
+ *
+ * Note that this file is intended only for SIMD operations that require
+ * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
+ * support, more functionality and higher performance, but the SIMD width is
+ * not necessarily equal to 4.
+ */ + +#ifdef _gmx_simd4_macros_h_ +#error "gmx_simd4_macros.h included twice" +#else +#define _gmx_simd4_macros_h_ + + +/* The SIMD width here is always 4, since that is the whole point */ +#define GMX_SIMD4_WIDTH 4 + + +#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE +/* Precision set before inclusion, honour that request */ +#else +/* Match precision to the Gromacs real precision */ +#ifdef GMX_DOUBLE +#define GMX_SIMD4_DOUBLE +#else +#define GMX_SIMD4_SINGLE +#endif +#endif + +#ifdef GMX_SIMD4_DOUBLE +typedef double gmx_simd4_real; +#endif +#ifdef GMX_SIMD4_SINGLE +typedef float gmx_simd4_real; +#endif + +/* Uncomment the next line, without other SIMD active, for testing plain-C */ +/* #define GMX_SIMD4_REFERENCE_PLAIN_C */ +#ifdef GMX_SIMD4_REFERENCE_PLAIN_C +/* Plain C SIMD reference implementation, also serves as documentation */ +#define GMX_HAVE_SIMD4_MACROS + +/* Include plain-C reference implementation, also serves as documentation */ +#include "gmx_simd4_ref.h" + +/* float/double SIMD register type */ +#define gmx_simd4_pr gmx_simd4_ref_pr + +/* boolean SIMD register type */ +#define gmx_simd4_pb gmx_simd4_ref_pb + +#define gmx_simd4_load_pr gmx_simd4_ref_load_pr +#define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr +#define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr +#define gmx_simd4_store_pr gmx_simd4_ref_store_pr + +/* Unaligned load+store are not required, + * but they can speed up the PME spread+gather operations. + */ +#define GMX_SIMD4_HAVE_UNALIGNED +#ifdef GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr gmx_simd4_ref_load_pr +#define gmx_simd4_storeu_pr gmx_simd4_ref_store_pr +#endif + +#define gmx_simd4_add_pr gmx_simd4_ref_add_pr +#define gmx_simd4_sub_pr gmx_simd4_ref_sub_pr +#define gmx_simd4_mul_pr gmx_simd4_ref_mul_pr +/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */ +#define gmx_simd4_madd_pr gmx_simd4_ref_madd_pr +#define gmx_simd4_nmsub_pr gmx_simd4_ref_nmsub_pr + +#define gmx_simd4_dotproduct3 gmx_simd4_ref_dotproduct3 + +#define gmx_simd4_min_pr gmx_simd4_ref_min_pr +#define gmx_simd4_max_pr gmx_simd4_ref_max_pr + +#define gmx_simd4_blendzero_pr gmx_simd4_ref_blendzero_pr + +/* Comparison */ +#define gmx_simd4_cmplt_pr gmx_simd4_ref_cmplt_pr + +/* Logical operations on SIMD booleans */ +#define gmx_simd4_and_pb gmx_simd4_ref_and_pb +#define gmx_simd4_or_pb gmx_simd4_ref_or_pb + +/* Returns a single int (0/1) which tells if any of the 4 booleans is True */ +#define gmx_simd4_anytrue_pb gmx_simd4_ref_anytrue_pb + +#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */ + + +/* The same SIMD macros can be translated to SIMD intrinsics (and compiled + * to instructions for) different SIMD width and float precision. + * + * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX). + * The _pr suffix is replaced by _ps or _pd (for single or double precision). + * Compiler settings will decide if 128-bit intrinsics will + * be translated into SSE or AVX instructions. + */ + + +#ifdef GMX_X86_SSE2 +/* This is for general x86 SIMD instruction sets that also support SSE2 */ + +#ifdef GMX_SIMD4_SINGLE +#define GMX_HAVE_SIMD4_MACROS +#endif + +#ifdef GMX_SIMD4_DOUBLE +/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA. + * This is inconsistent naming wise, but should give the best performance. 
+ */ +#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 +#define GMX_HAVE_SIMD4_MACROS +#endif +#endif + +#ifdef GMX_HAVE_SIMD4_MACROS + +#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 + +#include +#ifdef HAVE_X86INTRIN_H +#include /* FMA */ +#endif +#ifdef HAVE_INTRIN_H +#include /* FMA MSVC */ +#endif + +#else +#ifdef GMX_X86_SSE4_1 +#include +#else +/* We only have SSE2 */ +#include +#endif +#endif + +#ifdef GMX_SIMD4_SINGLE + +#define gmx_simd4_pr __m128 + +#define gmx_simd4_pb __m128 + +#define gmx_simd4_load_pr _mm_load_ps +#define gmx_simd4_set1_pr _mm_set1_ps +#define gmx_simd4_setzero_pr _mm_setzero_ps +#define gmx_simd4_store_pr _mm_store_ps + +/* Some old AMD processors could have problems with unaligned loads+stores */ +#ifndef GMX_FAHCORE +#define GMX_SIMD4_HAVE_UNALIGNED +#endif +#ifdef GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr _mm_loadu_ps +#define gmx_simd4_storeu_pr _mm_storeu_ps +#endif + +#define gmx_simd4_add_pr _mm_add_ps +#define gmx_simd4_sub_pr _mm_sub_ps +#define gmx_simd4_mul_pr _mm_mul_ps + +#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd4_madd_pr(a, b, c) _mm_macc_ps(a, b, c) +#define gmx_simd4_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) +#else +#define gmx_simd4_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd4_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#endif + +static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) +#ifdef GMX_X86_SSE4_1 +{ + float dp; + + /* SSE4.1 dot product of components 0,1,2, stored in component 0 */ + _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71)); + + return dp; +} +#else +{ + float dp_array[7], *dp; + + /* Generate an aligned pointer */ + dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15))); + + _mm_store_ps(dp, _mm_mul_ps(a, b)); + + return dp[0] + dp[1] + dp[2]; +} +#endif + +#define gmx_simd4_min_pr _mm_min_ps +#define gmx_simd4_max_pr _mm_max_ps + +#define gmx_simd4_blendzero_pr _mm_and_ps + +#define gmx_simd4_cmplt_pr _mm_cmplt_ps +#define gmx_simd4_and_pb _mm_and_ps +#define gmx_simd4_or_pb _mm_or_ps + +#define gmx_simd4_anytrue_pb _mm_movemask_ps + +#endif /* GMX_SIMD4_SINGLE */ + + +#ifdef GMX_SIMD4_DOUBLE + +#define gmx_simd4_pr __m256d + +#define gmx_simd4_pb __m256d + +#define gmx_simd4_load_pr _mm256_load_pd +#define gmx_simd4_set1_pr _mm256_set1_pd +#define gmx_simd4_setzero_pr _mm256_setzero_pd +#define gmx_simd4_store_pr _mm256_store_pd + +#define GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr _mm256_loadu_pd +#define gmx_simd4_storeu_pr _mm256_storeu_pd + +#define gmx_simd4_add_pr _mm256_add_pd +#define gmx_simd4_sub_pr _mm256_sub_pd +#define gmx_simd4_mul_pr _mm256_mul_pd +#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd4_madd_pr(a, b, c) _mm256_macc_pd(a, b, c) +#define gmx_simd4_nmsub_pr(a, b, c) _mm256_nmacc_pd(a, b, c) +#else +#define gmx_simd4_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd4_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) +#endif +#define gmx_simd4_min_pr _mm256_min_pd +#define gmx_simd4_max_pr _mm256_max_pd + +#define gmx_simd4_blendzero_pr _mm256_and_pd + +/* Less-than (we use ordered, non-signaling, but that's not required) */ +#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11) +#define gmx_simd4_and_pb _mm256_and_pd +#define gmx_simd4_or_pb _mm256_or_pd + +#define gmx_simd4_anytrue_pb _mm256_movemask_pd + +#endif /* GMX_SIMD4_DOUBLE */ + + +#endif /* GMX_HAVE_SIMD4_MACROS */ + + +#endif /* GMX_X86_SSE2 */ + + +#ifdef GMX_HAVE_SIMD4_MACROS +/* Generic functions to extract a SIMD4 aligned pointer 
from a pointer x. + * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared + * to how many you want to use, to avoid indexing outside the aligned region. + */ + +static gmx_inline gmx_simd4_real * +gmx_simd4_align_real(const gmx_simd4_real *x) +{ + return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1)))); +} +#endif + + +#endif /* _gmx_simd4_macros_h_ */ diff --git a/include/gmx_simd4_ref.h b/include/gmx_simd4_ref.h new file mode 100644 index 0000000000..192a6105a9 --- /dev/null +++ b/include/gmx_simd4_ref.h @@ -0,0 +1,308 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef _gmx_simd4_ref_h_ +#define _gmx_simd4_ref_h_ + +/* This file contains a reference plain-C implementation of 4-wide SIMD. + * This code is only useful for testing and documentation. 
+ * Either float or double precision is supported through gmx_simd4_real, + * which is set in gmx_simd4_macros.h + */ + + +#include + +/* float/double SIMD register type */ +typedef struct { + gmx_simd4_real r[GMX_SIMD4_WIDTH]; +} gmx_simd4_ref_pr; + +/* boolean SIMD register type */ +typedef struct { + char r[GMX_SIMD4_WIDTH]; +} gmx_simd4_ref_pb; + + +/* Load GMX_SIMD4_WIDTH reals for memory starting at r */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_load_pr(const gmx_simd4_real *r) +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = r[i]; + } + + return a; +} + +/* Set all SIMD register elements to r */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_set1_pr(gmx_simd4_real r) +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = r; + } + + return a; +} + +/* Set all SIMD register elements to 0 */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_setzero_pr() +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = 0.0; + } + + return a; +} + +static gmx_inline void +gmx_simd4_ref_store_pr(gmx_simd4_real *dest, gmx_simd4_ref_pr src) +{ + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + dest[i] = src.r[i]; + } +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_add_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i] + b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_sub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i] - b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_mul_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i]*b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_madd_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c) +{ + gmx_simd4_ref_pr d; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + d.r[i] = a.r[i]*b.r[i] + c.r[i]; + } + + return d; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_nmsub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c) +{ + gmx_simd4_ref_pr d; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + d.r[i] = -a.r[i]*b.r[i] + c.r[i]; + } + + return d; +} + +static gmx_inline gmx_simd4_real +gmx_simd4_ref_dotproduct3(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_real dp; + int i; + + dp = 0.0; + for (i = 0; i < 3; i++) + { + dp += a.r[i]*b.r[i]; + } + + return dp; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_min_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]); + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_max_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]); + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_blendzero_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (b.r[i] ? 
a.r[i] : 0.0); + } + + return c; +} + +/* Comparison */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_cmplt_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] < b.r[i]); + } + + return c; +} + +/* Logical AND on SIMD booleans */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_and_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] && b.r[i]); + } + + return c; +} + +/* Logical OR on SIMD booleans */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] || b.r[i]); + } + + return c; +} + +/* gmx_anytrue_pb(x) returns if any of the boolean is x is True */ +static gmx_inline int +gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a) +{ + int anytrue; + int i; + + anytrue = 0; + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + if (a.r[i]) + { + anytrue = 1; + } + } + + return anytrue; +} + +#endif /* _gmx_simd4_ref_h_ */ diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h index c997b39b51..d487565b9a 100644 --- a/include/gmx_simd_macros.h +++ b/include/gmx_simd_macros.h @@ -121,17 +121,8 @@ #define gmx_and_pb gmx_simd_ref_and_pb #define gmx_or_pb gmx_simd_ref_or_pb -/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True. - * If this is not present, define GMX_SIMD_IS_TRUE(real x), - * which should return x==True, where True is True as defined in SIMD. - */ -#define GMX_SIMD_HAVE_ANYTRUE -#ifdef GMX_SIMD_HAVE_ANYTRUE +/* Returns a single int (0/1) which tells if any of the 4 booleans is True */ #define gmx_anytrue_pb gmx_simd_ref_anytrue_pb -#else -/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */ -#define gmx_store_pb gmx_simd_ref_store_pb -#endif /* Conversions only used for PME table lookup */ #define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32 @@ -283,7 +274,6 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }; -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm_movemask_ps #define gmx_cvttpr_epi32 _mm_cvttps_epi32 @@ -356,7 +346,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm_and_pd #define gmx_or_pb _mm_or_pd -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm_movemask_pd #define gmx_cvttpr_epi32 _mm_cvttpd_epi32 @@ -423,7 +412,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm256_and_ps #define gmx_or_pb _mm256_or_ps -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm256_movemask_ps #define gmx_cvttpr_epi32 _mm256_cvttps_epi32 @@ -484,7 +472,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm256_and_pd #define gmx_or_pb _mm256_or_pd -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm256_movemask_pd #define gmx_cvttpr_epi32 _mm256_cvttpd_epi32 diff --git a/include/gmx_simd_ref.h b/include/gmx_simd_ref.h index 3a3900a821..082132ed5c 100644 --- a/include/gmx_simd_ref.h +++ b/include/gmx_simd_ref.h @@ -377,10 +377,7 @@ gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b) return c; } -/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True. 
- * If this is not present, define GMX_SIMD_IS_TRUE(real x), - * which should return x==True, where True is True as defined in SIMD. - */ +/* Returns a single int (0/1) which tells if any of the booleans is True */ static gmx_inline int gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a) { @@ -399,18 +396,6 @@ gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a) return anytrue; } -/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */ -static gmx_inline void -gmx_simd_ref_store_pb(real *dest, gmx_simd_ref_pb src) -{ - int i; - - for (i = 0; i < GMX_SIMD_REF_WIDTH; i++) - { - dest[i] = src.r[i]; - } -}; - /* Conversions only used for PME table lookup */ static gmx_inline gmx_simd_ref_epi32 gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a) diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h index 833407e1dc..ae76083faf 100644 --- a/src/mdlib/nbnxn_internal.h +++ b/src/mdlib/nbnxn_internal.h @@ -53,14 +53,21 @@ #include "gmx_simd_macros.h" #endif -#ifdef __cplusplus -extern "C" { + +/* Bounding box calculations are (currently) always in single precision. + * This uses less (cache-)memory and SIMD is faster, at least on x86. + */ +#define GMX_SIMD4_SINGLE +/* Include the 4-wide SIMD macro file */ +#include "gmx_simd4_macros.h" +/* Check if we have 4-wide SIMD macro support */ +#ifdef GMX_HAVE_SIMD4_MACROS +#define NBNXN_SEARCH_BB_SIMD4 #endif -#ifdef GMX_X86_SSE2 -/* Use 4-way SIMD for, always, single precision bounding box calculations */ -#define NBNXN_SEARCH_BB_SSE +#ifdef __cplusplus +extern "C" { #endif @@ -73,9 +80,18 @@ extern "C" { #endif +#ifdef NBNXN_SEARCH_BB_SIMD4 +/* Memory alignment in bytes as required by SIMD aligned loads/stores */ +#define NBNXN_SEARCH_BB_MEM_ALIGN (GMX_SIMD4_WIDTH*sizeof(float)) +#else +/* No alignment required, but set it so we can call the same routines */ +#define NBNXN_SEARCH_BB_MEM_ALIGN 32 +#endif + + /* Pair search box lower and upper corner in x,y,z. - * Store this in 4 iso 3 reals, which is useful with SSE. - * To avoid complicating the code we also use 4 without SSE. + * Store this in 4 iso 3 reals, which is useful with 4-wide SIMD. + * To avoid complicating the code we also use 4 without 4-wide SIMD. */ #define NNBSBB_C 4 /* Pair search box lower and upper bound in z only. */ @@ -122,7 +138,8 @@ typedef struct { int *nsubc; /* The number of sub cells for each super cell */ float *bbcz; /* Bounding boxes in z for the super cells */ nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */ - nbnxn_bb_t *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */ + nbnxn_bb_t *bbj; /* 3D j-bounding boxes for the case where * + * the i- and j-cluster sizes are different */ float *pbb; /* 3D b. 
boxes in xxxx format per super cell */ int *flags; /* Flag for the super cells */ int nc_nalloc; /* Allocation size for the pointers above */ @@ -139,16 +156,16 @@ typedef struct { typedef struct nbnxn_x_ci_simd_4xn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0; - gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1; - gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2; - gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3; + gmx_mm_pr ix_S0, iy_S0, iz_S0; + gmx_mm_pr ix_S1, iy_S1, iz_S1; + gmx_mm_pr ix_S2, iy_S2, iz_S2; + gmx_mm_pr ix_S3, iy_S3, iz_S3; } nbnxn_x_ci_simd_4xn_t; typedef struct nbnxn_x_ci_simd_2xnn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0; - gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2; + gmx_mm_pr ix_S0, iy_S0, iz_S0; + gmx_mm_pr ix_S2, iy_S2, iz_S2; } nbnxn_x_ci_simd_2xnn_t; #endif diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c index 738ffac644..ac23d9bc59 100644 --- a/src/mdlib/nbnxn_search.c +++ b/src/mdlib/nbnxn_search.c @@ -62,29 +62,26 @@ #include "nrnb.h" -#ifdef NBNXN_SEARCH_BB_SSE -/* We use SSE or AVX-128bit for bounding box calculations */ +#ifdef NBNXN_SEARCH_BB_SIMD4 +/* We use 4-wide SIMD for bounding box calculations */ #ifndef GMX_DOUBLE -/* Single precision BBs + coordinates, we can also load coordinates using SSE */ -#define NBNXN_SEARCH_SSE_SINGLE +/* Single precision BBs + coordinates, we can also load coordinates with SIMD */ +#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB #endif -/* Include basic SSE2 stuff */ -#include - -#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8) +#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8) /* Store bounding boxes with x, y and z coordinates in packs of 4 */ -#define NBNXN_PBB_SSE +#define NBNXN_PBB_SIMD4 #endif -/* The width of SSE/AVX128 with single precision for bounding boxes with GPU. - * Here AVX-256 turns out to be slightly slower than AVX-128. +/* The packed bounding box coordinate stride is always set to 4. + * With AVX we could use 8, but that turns out not to be faster. */ #define STRIDE_PBB 4 #define STRIDE_PBB_2LOG 2 -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ #ifdef GMX_NBNXN_SIMD @@ -145,7 +142,7 @@ #endif /* GMX_NBNXN_SIMD */ -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */ #define NBNXN_BBXXXX /* Size of bounding box corners quadruplet */ @@ -497,7 +494,7 @@ static int set_grid_size_xy(const nbnxn_search_t nbs, sfree_aligned(grid->bb); /* This snew also zeros the contents, this avoid possible - * floating exceptions in SSE with the unused bb elements. + * floating exceptions in SIMD with the unused bb elements. */ if (grid->bSimple) { @@ -805,19 +802,21 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, /* Set the "empty" bounding box to the same as the first one, * so we don't need to treat special cases in the rest of the code. 
*/ -#ifdef NBNXN_SEARCH_BB_SSE - _mm_store_ps(&bbj[1].lower[0], _mm_load_ps(&bbj[0].lower[0])); - _mm_store_ps(&bbj[1].upper[0], _mm_load_ps(&bbj[0].upper[0])); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_pr(&bbj[0].lower[0])); + gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_pr(&bbj[0].upper[0])); #else bbj[1] = bbj[0]; #endif } -#ifdef NBNXN_SEARCH_BB_SSE - _mm_store_ps(&bb->lower[0], _mm_min_ps(_mm_load_ps(&bbj[0].lower[0]), - _mm_load_ps(&bbj[1].lower[0]))); - _mm_store_ps(&bb->upper[0], _mm_max_ps(_mm_load_ps(&bbj[0].upper[0]), - _mm_load_ps(&bbj[1].upper[0]))); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_store_pr(&bb->lower[0], + gmx_simd4_min_pr(gmx_simd4_load_pr(&bbj[0].lower[0]), + gmx_simd4_load_pr(&bbj[1].lower[0]))); + gmx_simd4_store_pr(&bb->upper[0], + gmx_simd4_max_pr(gmx_simd4_load_pr(&bbj[0].upper[0]), + gmx_simd4_load_pr(&bbj[1].upper[0]))); #else { int i; @@ -831,7 +830,7 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, #endif } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Coordinate order xyz, bb order xxxxyyyyzzzz */ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) @@ -866,38 +865,38 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) bb[5*STRIDE_PBB] = R2F_U(zh); } -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB /* Coordinate order xyz?, bb order xyz0 */ -static void calc_bounding_box_sse(int na, const float *x, nbnxn_bb_t *bb) +static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb) { - __m128 bb_0_SSE, bb_1_SSE; - __m128 x_SSE; + gmx_simd4_pr bb_0_S, bb_1_S; + gmx_simd4_pr x_S; int i; - bb_0_SSE = _mm_load_ps(x); - bb_1_SSE = bb_0_SSE; + bb_0_S = gmx_simd4_load_pr(x); + bb_1_S = bb_0_S; for (i = 1; i < na; i++) { - x_SSE = _mm_load_ps(x+i*NNBSBB_C); - bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE); - bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE); + x_S = gmx_simd4_load_pr(x+i*NNBSBB_C); + bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S); + bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S); } - _mm_store_ps(&bb->lower[0], bb_0_SSE); - _mm_store_ps(&bb->upper[0], bb_1_SSE); + gmx_simd4_store_pr(&bb->lower[0], bb_0_S); + gmx_simd4_store_pr(&bb->upper[0], bb_1_S); } /* Coordinate order xyz?, bb order xxxxyyyyzzzz */ -static void calc_bounding_box_xxxx_sse(int na, const float *x, - nbnxn_bb_t *bb_work_aligned, - real *bb) +static void calc_bounding_box_xxxx_simd4(int na, const float *x, + nbnxn_bb_t *bb_work_aligned, + real *bb) { - calc_bounding_box_sse(na, x, bb_work_aligned); + calc_bounding_box_simd4(na, x, bb_work_aligned); bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X]; bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y]; @@ -907,7 +906,7 @@ static void calc_bounding_box_xxxx_sse(int na, const float *x, bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z]; } -#endif /* NBNXN_SEARCH_SSE_SINGLE */ +#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */ /* Combines pairs of consecutive bounding boxes */ @@ -923,15 +922,15 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb) nc2 = (grid->cxy_na[i]+3)>>(2+1); for (c2 = sc2; c2 < sc2+nc2; c2++) { -#ifdef NBNXN_SEARCH_BB_SSE - __m128 min_SSE, max_SSE; - - min_SSE = _mm_min_ps(_mm_load_ps(&bb[c2*2+0].lower[0]), - _mm_load_ps(&bb[c2*2+1].lower[0])); - max_SSE = _mm_max_ps(_mm_load_ps(&bb[c2*2+0].upper[0]), - _mm_load_ps(&bb[c2*2+1].upper[0])); - _mm_store_ps(&grid->bbj[c2].lower[0], min_SSE); - 
_mm_store_ps(&grid->bbj[c2].upper[0], max_SSE); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_pr min_S, max_S; + + min_S = gmx_simd4_min_pr(gmx_simd4_load_pr(&bb[c2*2+0].lower[0]), + gmx_simd4_load_pr(&bb[c2*2+1].lower[0])); + max_S = gmx_simd4_max_pr(gmx_simd4_load_pr(&bb[c2*2+0].upper[0]), + gmx_simd4_load_pr(&bb[c2*2+1].upper[0])); + gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S); + gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S); #else for (j = 0; j < NNBSBB_C; j++) { @@ -1176,18 +1175,18 @@ void fill_cell(const nbnxn_search_t nbs, else if (!grid->bSimple) { /* Store the bounding boxes in a format convenient - * for SSE calculations: xxxxyyyyzzzz... + * for SIMD4 calculations: xxxxyyyyzzzz... */ pbb_ptr = grid->pbb + ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX + (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1)); -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB if (nbat->XFormat == nbatXYZQ) { - calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride, - bb_work_aligned, pbb_ptr); + calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride, + bb_work_aligned, pbb_ptr); } else #endif @@ -2065,134 +2064,114 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci, return d2; } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 -/* SSE code for bb distance for bb format xyz0 */ -static float subc_bb_dist2_sse(int si, const nbnxn_bb_t *bb_i_ci, - int csj, const nbnxn_bb_t *bb_j_all) +/* 4-wide SIMD code for bb distance for bb format xyz0 */ +static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, + int csj, const nbnxn_bb_t *bb_j_all) { - __m128 bb_i_SSE0, bb_i_SSE1; - __m128 bb_j_SSE0, bb_j_SSE1; - __m128 dl_SSE; - __m128 dh_SSE; - __m128 dm_SSE; - __m128 dm0_SSE; - __m128 d2_SSE; -#ifndef GMX_X86_SSE4_1 - float d2_array[7], *d2_align; - - d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15))); -#else - float d2; -#endif - - bb_i_SSE0 = _mm_load_ps(&bb_i_ci[si].lower[0]); - bb_i_SSE1 = _mm_load_ps(&bb_i_ci[si].upper[0]); - bb_j_SSE0 = _mm_load_ps(&bb_j_all[csj].lower[0]); - bb_j_SSE1 = _mm_load_ps(&bb_j_all[csj].upper[0]); - - dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1); - dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1); + gmx_simd4_pr bb_i_S0, bb_i_S1; + gmx_simd4_pr bb_j_S0, bb_j_S1; + gmx_simd4_pr dl_S; + gmx_simd4_pr dh_S; + gmx_simd4_pr dm_S; + gmx_simd4_pr dm0_S; - dm_SSE = _mm_max_ps(dl_SSE, dh_SSE); - dm0_SSE = _mm_max_ps(dm_SSE, _mm_setzero_ps()); -#ifndef GMX_X86_SSE4_1 - d2_SSE = _mm_mul_ps(dm0_SSE, dm0_SSE); + bb_i_S0 = gmx_simd4_load_pr(&bb_i_ci[si].lower[0]); + bb_i_S1 = gmx_simd4_load_pr(&bb_i_ci[si].upper[0]); + bb_j_S0 = gmx_simd4_load_pr(&bb_j_all[csj].lower[0]); + bb_j_S1 = gmx_simd4_load_pr(&bb_j_all[csj].upper[0]); - _mm_store_ps(d2_align, d2_SSE); + dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1); + dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1); - return d2_align[0] + d2_align[1] + d2_align[2]; -#else - /* SSE4.1 dot product of components 0,1,2 */ - d2_SSE = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71); - - _mm_store_ss(&d2, d2_SSE); + dm_S = gmx_simd4_max_pr(dl_S, dh_S); + dm0_S = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr()); - return d2; -#endif + return gmx_simd4_dotproduct3(dm0_S, dm0_S); } /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */ -#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \ +#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \ { \ int shi; \ \ - __m128 dx_0, dy_0, dz_0; \ - __m128 dx_1, dy_1, dz_1; \ + gmx_simd4_pr dx_0, dy_0, dz_0; \ 
+ gmx_simd4_pr dx_1, dy_1, dz_1; \ \ - __m128 mx, my, mz; \ - __m128 m0x, m0y, m0z; \ + gmx_simd4_pr mx, my, mz; \ + gmx_simd4_pr m0x, m0y, m0z; \ \ - __m128 d2x, d2y, d2z; \ - __m128 d2s, d2t; \ + gmx_simd4_pr d2x, d2y, d2z; \ + gmx_simd4_pr d2s, d2t; \ \ shi = si*NNBSBB_D*DIM; \ \ - xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \ - yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \ - zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \ - xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \ - yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \ - zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \ + xi_l = gmx_simd4_load_pr(bb_i+shi+0*STRIDE_PBB); \ + yi_l = gmx_simd4_load_pr(bb_i+shi+1*STRIDE_PBB); \ + zi_l = gmx_simd4_load_pr(bb_i+shi+2*STRIDE_PBB); \ + xi_h = gmx_simd4_load_pr(bb_i+shi+3*STRIDE_PBB); \ + yi_h = gmx_simd4_load_pr(bb_i+shi+4*STRIDE_PBB); \ + zi_h = gmx_simd4_load_pr(bb_i+shi+5*STRIDE_PBB); \ \ - dx_0 = _mm_sub_ps(xi_l, xj_h); \ - dy_0 = _mm_sub_ps(yi_l, yj_h); \ - dz_0 = _mm_sub_ps(zi_l, zj_h); \ + dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \ + dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \ + dz_0 = gmx_simd4_sub_pr(zi_l, zj_h); \ \ - dx_1 = _mm_sub_ps(xj_l, xi_h); \ - dy_1 = _mm_sub_ps(yj_l, yi_h); \ - dz_1 = _mm_sub_ps(zj_l, zi_h); \ + dx_1 = gmx_simd4_sub_pr(xj_l, xi_h); \ + dy_1 = gmx_simd4_sub_pr(yj_l, yi_h); \ + dz_1 = gmx_simd4_sub_pr(zj_l, zi_h); \ \ - mx = _mm_max_ps(dx_0, dx_1); \ - my = _mm_max_ps(dy_0, dy_1); \ - mz = _mm_max_ps(dz_0, dz_1); \ + mx = gmx_simd4_max_pr(dx_0, dx_1); \ + my = gmx_simd4_max_pr(dy_0, dy_1); \ + mz = gmx_simd4_max_pr(dz_0, dz_1); \ \ - m0x = _mm_max_ps(mx, zero); \ - m0y = _mm_max_ps(my, zero); \ - m0z = _mm_max_ps(mz, zero); \ + m0x = gmx_simd4_max_pr(mx, zero); \ + m0y = gmx_simd4_max_pr(my, zero); \ + m0z = gmx_simd4_max_pr(mz, zero); \ \ - d2x = _mm_mul_ps(m0x, m0x); \ - d2y = _mm_mul_ps(m0y, m0y); \ - d2z = _mm_mul_ps(m0z, m0z); \ + d2x = gmx_simd4_mul_pr(m0x, m0x); \ + d2y = gmx_simd4_mul_pr(m0y, m0y); \ + d2z = gmx_simd4_mul_pr(m0z, m0z); \ \ - d2s = _mm_add_ps(d2x, d2y); \ - d2t = _mm_add_ps(d2s, d2z); \ + d2s = gmx_simd4_add_pr(d2x, d2y); \ + d2t = gmx_simd4_add_pr(d2s, d2z); \ \ - _mm_store_ps(d2+si, d2t); \ + gmx_simd4_store_pr(d2+si, d2t); \ } -/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */ -static void subc_bb_dist2_sse_xxxx(const float *bb_j, - int nsi, const float *bb_i, - float *d2) +/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */ +static void subc_bb_dist2_simd4_xxxx(const float *bb_j, + int nsi, const float *bb_i, + float *d2) { - __m128 xj_l, yj_l, zj_l; - __m128 xj_h, yj_h, zj_h; - __m128 xi_l, yi_l, zi_l; - __m128 xi_h, yi_h, zi_h; + gmx_simd4_pr xj_l, yj_l, zj_l; + gmx_simd4_pr xj_h, yj_h, zj_h; + gmx_simd4_pr xi_l, yi_l, zi_l; + gmx_simd4_pr xi_h, yi_h, zi_h; - __m128 zero; + gmx_simd4_pr zero; - zero = _mm_setzero_ps(); + zero = gmx_simd4_setzero_pr(); - xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]); - yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]); - zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]); - xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]); - yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]); - zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]); + xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]); + yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]); + zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]); + xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]); + yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]); + zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]); /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB. 
* But as we know the number of iterations is 1 or 2, we unroll manually. */ - SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2); + SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2); if (STRIDE_PBB < nsi) { - SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2); + SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2); } } -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ /* Plain C function which determines if any atom pair between two cells * is within distance sqrt(rl2). @@ -2226,44 +2205,44 @@ static gmx_bool subc_in_range_x(int na_c, return FALSE; } -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB /* When we make seperate single/double precision SIMD vector operation * include files, this function should be moved there (also using FMA). */ -static inline __m128 -gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z) +static inline gmx_simd4_pr +gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z) { - return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) ); + return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) ); } #endif -/* SSE function which determines if any atom pair between two cells, +/* 4-wide SIMD function which determines if any atom pair between two cells, * both with 8 atoms, is within distance sqrt(rl2). - * Not performance critical, so only uses plain SSE. + * Using 8-wide AVX is not faster on Intel Sandy Bridge. */ -static gmx_bool subc_in_range_sse8(int na_c, - int si, const real *x_i, - int csj, int stride, const real *x_j, - real rl2) +static gmx_bool subc_in_range_simd4(int na_c, + int si, const real *x_i, + int csj, int stride, const real *x_j, + real rl2) { -#ifdef NBNXN_SEARCH_SSE_SINGLE - __m128 ix_SSE0, iy_SSE0, iz_SSE0; - __m128 ix_SSE1, iy_SSE1, iz_SSE1; +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB + gmx_simd4_pr ix_S0, iy_S0, iz_S0; + gmx_simd4_pr ix_S1, iy_S1, iz_S1; - __m128 rc2_SSE; + gmx_simd4_pr rc2_S; - int na_c_sse; + int dim_stride; int j0, j1; - rc2_SSE = _mm_set1_ps(rl2); + rc2_S = gmx_simd4_set1_pr(rl2); - na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB; - ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB); - iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB); - iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB); - ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB); - iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB); - iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB); + dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM; + ix_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+0)*STRIDE_PBB); + iy_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+1)*STRIDE_PBB); + iz_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+2)*STRIDE_PBB); + ix_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+3)*STRIDE_PBB); + iy_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+4)*STRIDE_PBB); + iz_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+5)*STRIDE_PBB); /* We loop from the outer to the inner particles to maximize * the chance that we find a pair in range quickly and return. 
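
/* Illustration only, not part of this patch: the 4-wide bounding-box
 * distance kernels above (subc_bb_dist2_simd4 and the xxxx variant) all
 * compute the same quantity, namely the squared gap between two
 * axis-aligned boxes, clamped to zero along each dimension where the
 * boxes overlap.  A minimal plain-C sketch of that computation follows;
 * the type and function names are made up for the example.
 */

typedef struct {
    float lower[4];  /* lower box corner, xyz0 order (element 3 is padding) */
    float upper[4];  /* upper box corner, xyz0 order                        */
} bb_example_t;

static float
bb_dist2_example(const bb_example_t *bi, const bb_example_t *bj)
{
    float d2;
    int   d;

    d2 = 0;
    for (d = 0; d < 3; d++)
    {
        /* Signed gaps between the boxes; both are negative when they overlap */
        float dl  = bi->lower[d] - bj->upper[d];
        float dh  = bj->lower[d] - bi->upper[d];
        /* The largest gap, clamped at zero, is the separation along d */
        float dm  = (dl > dh ? dl : dh);
        float dm0 = (dm > 0 ? dm : 0);

        d2 += dm0*dm0;
    }

    return d2;
}
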
@@ -2272,63 +2251,63 @@ static gmx_bool subc_in_range_sse8(int na_c, j1 = j0 + na_c - 1; while (j0 < j1) { - __m128 jx0_SSE, jy0_SSE, jz0_SSE; - __m128 jx1_SSE, jy1_SSE, jz1_SSE; + gmx_simd4_pr jx0_S, jy0_S, jz0_S; + gmx_simd4_pr jx1_S, jy1_S, jz1_S; - __m128 dx_SSE0, dy_SSE0, dz_SSE0; - __m128 dx_SSE1, dy_SSE1, dz_SSE1; - __m128 dx_SSE2, dy_SSE2, dz_SSE2; - __m128 dx_SSE3, dy_SSE3, dz_SSE3; + gmx_simd4_pr dx_S0, dy_S0, dz_S0; + gmx_simd4_pr dx_S1, dy_S1, dz_S1; + gmx_simd4_pr dx_S2, dy_S2, dz_S2; + gmx_simd4_pr dx_S3, dy_S3, dz_S3; - __m128 rsq_SSE0; - __m128 rsq_SSE1; - __m128 rsq_SSE2; - __m128 rsq_SSE3; + gmx_simd4_pr rsq_S0; + gmx_simd4_pr rsq_S1; + gmx_simd4_pr rsq_S2; + gmx_simd4_pr rsq_S3; - __m128 wco_SSE0; - __m128 wco_SSE1; - __m128 wco_SSE2; - __m128 wco_SSE3; - __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE; + gmx_simd4_pb wco_S0; + gmx_simd4_pb wco_S1; + gmx_simd4_pb wco_S2; + gmx_simd4_pb wco_S3; + gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S; - jx0_SSE = _mm_load1_ps(x_j+j0*stride+0); - jy0_SSE = _mm_load1_ps(x_j+j0*stride+1); - jz0_SSE = _mm_load1_ps(x_j+j0*stride+2); + jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]); + jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]); + jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]); - jx1_SSE = _mm_load1_ps(x_j+j1*stride+0); - jy1_SSE = _mm_load1_ps(x_j+j1*stride+1); - jz1_SSE = _mm_load1_ps(x_j+j1*stride+2); + jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]); + jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]); + jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]); /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx0_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy0_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz0_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx0_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy0_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz0_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE0, jx1_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE0, jy1_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE0, jz1_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE1, jx1_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE1, jy1_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE1, jz1_SSE); + dx_S0 = gmx_simd4_sub_pr(ix_S0, jx0_S); + dy_S0 = gmx_simd4_sub_pr(iy_S0, jy0_S); + dz_S0 = gmx_simd4_sub_pr(iz_S0, jz0_S); + dx_S1 = gmx_simd4_sub_pr(ix_S1, jx0_S); + dy_S1 = gmx_simd4_sub_pr(iy_S1, jy0_S); + dz_S1 = gmx_simd4_sub_pr(iz_S1, jz0_S); + dx_S2 = gmx_simd4_sub_pr(ix_S0, jx1_S); + dy_S2 = gmx_simd4_sub_pr(iy_S0, jy1_S); + dz_S2 = gmx_simd4_sub_pr(iz_S0, jz1_S); + dx_S3 = gmx_simd4_sub_pr(ix_S1, jx1_S); + dy_S3 = gmx_simd4_sub_pr(iy_S1, jy1_S); + dz_S3 = gmx_simd4_sub_pr(iz_S1, jz1_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); + rsq_S0 = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3); - wco_SSE0 = _mm_cmplt_ps(rsq_SSE0, rc2_SSE); - wco_SSE1 = _mm_cmplt_ps(rsq_SSE1, rc2_SSE); - wco_SSE2 = _mm_cmplt_ps(rsq_SSE2, rc2_SSE); - wco_SSE3 = _mm_cmplt_ps(rsq_SSE3, rc2_SSE); + wco_S0 = gmx_simd4_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_simd4_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_simd4_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_simd4_cmplt_pr(rsq_S3, rc2_S); - wco_any_SSE01 = _mm_or_ps(wco_SSE0, wco_SSE1); - wco_any_SSE23 = _mm_or_ps(wco_SSE2, wco_SSE3); - wco_any_SSE = _mm_or_ps(wco_any_SSE01, 
wco_any_SSE23); + wco_any_S01 = gmx_simd4_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_simd4_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_simd4_or_pb(wco_any_S01, wco_any_S23); - if (_mm_movemask_ps(wco_any_SSE)) + if (gmx_simd4_anytrue_pb(wco_any_S)) { return TRUE; } @@ -2339,8 +2318,8 @@ static gmx_bool subc_in_range_sse8(int na_c, return FALSE; #else - /* No SSE */ - gmx_incons("SSE function called without SSE support"); + /* No SIMD4 */ + gmx_incons("SIMD4 function called without 4-wide SIMD support"); return TRUE; #endif @@ -2493,22 +2472,22 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl, snew(nbl->work, 1); if (nbl->bSimple) { - snew_aligned(nbl->work->bb_ci, 1, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN); } else { #ifdef NBNXN_BBXXXX - snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN); #else - snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN); #endif } - snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN); #ifdef GMX_NBNXN_SIMD snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN); snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN); #endif - snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN); nbl->work->sort = NULL; nbl->work->sort_nalloc = 0; @@ -2901,7 +2880,7 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj, #include "nbnxn_search_simd_2xnn.h" #endif -/* Plain C or SSE code for making a pair list of super-cell sci vs scj. +/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj. * Checks bounding box distances and possibly atom pair distances. 
*/ static void make_cluster_list_supersub(const nbnxn_search_t nbs, @@ -2968,8 +2947,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, } #ifdef NBNXN_BBXXXX - /* Determine all ci1 bb distances in one call with SSE */ - subc_bb_dist2_sse_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)), + /* Determine all ci1 bb distances in one call with SIMD4 */ + subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)), ci1, pbb_ci, d2l); *ndistc += na_c*2; #endif @@ -2999,8 +2978,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, *ndistc += na_c*na_c; if (d2 < rbb2 || (d2 < rl2 && -#ifdef NBNXN_PBB_SSE - subc_in_range_sse8 +#ifdef NBNXN_PBB_SIMD4 + subc_in_range_simd4 #else subc_in_range_x #endif @@ -3031,8 +3010,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, { /* Avoid using function pointers here, as it's slower */ if ( -#ifdef NBNXN_PBB_SSE - !subc_in_range_sse8 +#ifdef NBNXN_PBB_SIMD4 + !subc_in_range_simd4 #else !subc_in_range_x #endif @@ -3128,7 +3107,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs, ndirect++; } } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 else { while (cj_ind_first + ndirect <= cj_ind_last && @@ -3734,13 +3713,13 @@ static void icell_set_x_supersub(int ci, } } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Copies PBC shifted super-cell packed atom coordinates to working array */ -static void icell_set_x_supersub_sse8(int ci, - real shx, real shy, real shz, - int na_c, - int stride, const real *x, - nbnxn_list_work_t *work) +static void icell_set_x_supersub_simd4(int ci, + real shx, real shy, real shz, + int na_c, + int stride, const real *x, + nbnxn_list_work_t *work) { int si, io, ia, i, j; real *x_ci; @@ -4989,8 +4968,8 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, } else { -#ifdef NBNXN_SEARCH_BB_SSE - nbs->icell_set_x = icell_set_x_supersub_sse8; +#ifdef NBNXN_SEARCH_BB_SIMD4 + nbs->icell_set_x = icell_set_x_supersub_simd4; #else nbs->icell_set_x = icell_set_x_supersub; #endif diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h b/src/mdlib/nbnxn_search_simd_2xnn.h index 3c649df2d5..f611791397 100644 --- a/src/mdlib/nbnxn_search_simd_2xnn.h +++ b/src/mdlib/nbnxn_search_simd_2xnn.h @@ -86,40 +86,14 @@ icell_set_x_simd_2xnn(int ci, ia = X_IND_CI_SIMD_2XNN(ci); - x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); - x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); - x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); - x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); - x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); - x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); + x_ci->ix_S0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); + x_ci->iy_S0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); + x_ci->iz_S0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); + x_ci->ix_S2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); + x_ci->iy_S2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); + x_ci->iz_S2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); } -#ifndef GMX_SIMD_HAVE_ANYTRUE -/* Fallback function in case gmx_anytrue_pr is not present */ -static gmx_inline gmx_bool -gmx_anytrue_2xn_pb(gmx_mm_pb bool_S) -{ - real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools; - gmx_bool any; - int s; - - bools = gmx_simd_align_real(bools_array); - - gmx_store_pb(bools, bool_S); - - any 
= FALSE; - for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++) - { - if (GMX_SIMD_IS_TRUE(s)) - { - any = TRUE; - } - } - - return any; -} -#endif - /* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. @@ -137,19 +111,19 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, const nbnxn_x_ci_simd_2xnn_t *work; const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_SSE, jy_SSE, jz_SSE; + gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0; - gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2; + gmx_mm_pr dx_S0, dy_S0, dz_S0; + gmx_mm_pr dx_S2, dy_S2, dz_S2; - gmx_mm_pr rsq_SSE0; - gmx_mm_pr rsq_SSE2; + gmx_mm_pr rsq_S0; + gmx_mm_pr rsq_S2; - gmx_mm_pb wco_SSE0; - gmx_mm_pb wco_SSE2; - gmx_mm_pb wco_any_SSE; + gmx_mm_pb wco_S0; + gmx_mm_pb wco_S2; + gmx_mm_pb wco_any_S; - gmx_mm_pr rc2_SSE; + gmx_mm_pr rc2_S; gmx_bool InRange; float d2; @@ -162,13 +136,13 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_SSE = gmx_set1_pr(rl2); + rc2_S = gmx_set1_pr(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj); #endif @@ -187,32 +161,28 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, { xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf); - jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); - jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); - jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); + jx_S = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); + jy_S = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); + jz_S = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2); + wco_any_S = gmx_or_pb(wco_S0, wco_S2); -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_2xn_pb(wco_any_SSE); -#endif + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 2*GMX_SIMD_WIDTH_HERE; } @@ -229,8 +199,8 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, InRange = FALSE; while (!InRange && cjl > cjf) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj); #endif @@ -249,32 +219,28 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, { xind_l = 
X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl); - jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); - jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); - jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); + jx_S = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); + jy_S = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); + jz_S = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2); + wco_any_S = gmx_or_pb(wco_S0, wco_S2); -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_2xn_pb(wco_any_SSE); -#endif + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 2*GMX_SIMD_WIDTH_HERE; } diff --git a/src/mdlib/nbnxn_search_simd_4xn.h b/src/mdlib/nbnxn_search_simd_4xn.h index a7a19579bb..9d5c638c43 100644 --- a/src/mdlib/nbnxn_search_simd_4xn.h +++ b/src/mdlib/nbnxn_search_simd_4xn.h @@ -58,46 +58,20 @@ icell_set_x_simd_4xn(int ci, ia = X_IND_CI_SIMD_4XN(ci); - x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); - x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); - x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); - x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); - x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); - x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); - x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); - x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); - x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); - x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); - x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); - x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); + x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); + x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); + x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); + x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); + x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); + x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); + x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); + x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); + x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); + x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); + x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); + x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); } -#ifndef GMX_SIMD_HAVE_ANYTRUE -/* Fallback function in case gmx_anytrue_pr is not present */ -static gmx_inline gmx_bool -gmx_anytrue_4xn_pb(gmx_mm_pb bool_S) -{ - real 
bools_array[2*GMX_SIMD_WIDTH_HERE], *bools; - gmx_bool any; - int s; - - bools = gmx_simd_align_real(bools_array); - - gmx_store_pb(bools, bool_S); - - any = FALSE; - for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++) - { - if (GMX_SIMD_IS_TRUE(bools[s])) - { - any = TRUE; - } - } - - return any; -} -#endif - /* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. @@ -115,25 +89,25 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, const nbnxn_x_ci_simd_4xn_t *work; const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_SSE, jy_SSE, jz_SSE; + gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0; - gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1; - gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2; - gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3; + gmx_mm_pr dx_S0, dy_S0, dz_S0; + gmx_mm_pr dx_S1, dy_S1, dz_S1; + gmx_mm_pr dx_S2, dy_S2, dz_S2; + gmx_mm_pr dx_S3, dy_S3, dz_S3; - gmx_mm_pr rsq_SSE0; - gmx_mm_pr rsq_SSE1; - gmx_mm_pr rsq_SSE2; - gmx_mm_pr rsq_SSE3; + gmx_mm_pr rsq_S0; + gmx_mm_pr rsq_S1; + gmx_mm_pr rsq_S2; + gmx_mm_pr rsq_S3; - gmx_mm_pb wco_SSE0; - gmx_mm_pb wco_SSE1; - gmx_mm_pb wco_SSE2; - gmx_mm_pb wco_SSE3; - gmx_mm_pb wco_any_SSE01, wco_any_SSE23, wco_any_SSE; + gmx_mm_pb wco_S0; + gmx_mm_pb wco_S1; + gmx_mm_pb wco_S2; + gmx_mm_pb wco_S3; + gmx_mm_pb wco_any_S01, wco_any_S23, wco_any_S; - gmx_mm_pr rc2_SSE; + gmx_mm_pr rc2_S; gmx_bool InRange; float d2; @@ -146,13 +120,13 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_SSE = gmx_set1_pr(rl2); + rc2_S = gmx_set1_pr(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj); #endif @@ -171,45 +145,41 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf); - jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S); - jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S); - jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S); + jx_S = gmx_load_pr(x_j+xind_f+0*STRIDE_S); + jy_S = gmx_load_pr(x_j+xind_f+1*STRIDE_S); + jz_S = gmx_load_pr(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE); - dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE); - dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); - dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE); - dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE); - dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); + dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); + dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); + dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); + dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1); - 
rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3); - - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE); - - wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1); - wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3); - wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23); - -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_4xn_pb(wco_any_SSE); -#endif + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + + wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 4*GMX_SIMD_WIDTH_HERE; } @@ -226,8 +196,8 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, InRange = FALSE; while (!InRange && cjl > cjf) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj); #endif @@ -246,44 +216,40 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl); - jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S); - jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S); - jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S); + jx_S = gmx_load_pr(x_j+xind_l+0*STRIDE_S); + jy_S = gmx_load_pr(x_j+xind_l+1*STRIDE_S); + jz_S = gmx_load_pr(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE); - dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE); - dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); - dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE); - dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE); - dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); + dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); + dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); + dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); + dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3); - - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE); - - wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1); - wco_any_SSE23 = gmx_or_pb(wco_SSE2, 
wco_SSE3); - wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23); - -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_4xn_pb(wco_any_SSE); -#endif + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + + wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 4*GMX_SIMD_WIDTH_HERE; } diff --git a/src/mdlib/pme.c b/src/mdlib/pme.c index c641e46821..3316546344 100644 --- a/src/mdlib/pme.c +++ b/src/mdlib/pme.c @@ -95,23 +95,28 @@ /* Include the SIMD macro file and then check for support */ #include "gmx_simd_macros.h" #if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP -/* Turn on SIMD intrinsics for PME solve */ +/* Turn on arbitrary width SIMD intrinsics for PME solve */ #define PME_SIMD #endif -/* SIMD spread+gather only in single precision with SSE2 or higher available. - * We might want to switch to use gmx_simd_macros.h, but this is somewhat - * complicated, as we use unaligned and/or 4-wide only loads. +/* Include the 4-wide SIMD macro file */ +#include "gmx_simd4_macros.h" +/* Check if we have 4-wide SIMD macro support */ +#ifdef GMX_HAVE_SIMD4_MACROS +/* Do PME spread and gather with 4-wide SIMD. + * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common). */ -#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE) -#define PME_SSE_SPREAD_GATHER -#include -/* Some old AMD processors could have problems with unaligned loads+stores */ -#ifndef GMX_FAHCORE -#define PME_SSE_UNALIGNED +#define PME_SIMD4_SPREAD_GATHER + +#ifdef GMX_SIMD4_HAVE_UNALIGNED +/* With PME-order=4 on x86, unaligned load+store is slightly faster + * than doubling all SIMD operations when using aligned load+store. + */ +#define PME_SIMD4_UNALIGNED #endif #endif + #include "mpelogging.h" #define DFT_TOL 1e-7 @@ -128,7 +133,16 @@ #define mpi_type MPI_FLOAT #endif -/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */ +#ifdef PME_SIMD4_SPREAD_GATHER +#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real)) +#else +/* We can use any alignment, apart from 0, so we use 4 reals */ +#define SIMD4_ALIGNMENT (4*sizeof(real)) +#endif + +/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size + * to preserve alignment. + */ #define GMX_CACHE_SEP 64 /* We only define a maximum to be able to use local arrays without allocation. 
@@ -239,9 +253,9 @@ typedef struct { typedef struct { -#ifdef PME_SSE_SPREAD_GATHER - /* Masks for SSE aligned spreading and gathering */ - __m128 mask_SSE0[6], mask_SSE1[6]; +#ifdef PME_SIMD4_SPREAD_GATHER + /* Masks for 4-wide SIMD aligned spreading and gathering */ + gmx_simd4_pb mask_S0[6], mask_S1[6]; #else int dummy; /* C89 requires that struct has at least one member */ #endif @@ -589,8 +603,9 @@ static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc) srenew(th[XX], nalloc); srenew(th[YY], nalloc); - /* In z we add padding, this is only required for the aligned SSE code */ - srenew(*ptr_z, nalloc+2*padding); + /* In z we add padding, this is only required for the aligned SIMD code */ + sfree_aligned(*ptr_z); + snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT); th[ZZ] = *ptr_z + padding; for (i = 0; i < padding; i++) @@ -1514,6 +1529,12 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, int pnx, pny, pnz, ndatatot; int offx, offy, offz; +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED + real thz_buffer[12], *thz_aligned; + + thz_aligned = gmx_simd4_align_real(thz_buffer); +#endif + pnx = pmegrid->s[XX]; pny = pmegrid->s[YY]; pnz = pmegrid->s[ZZ]; @@ -1552,23 +1573,23 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, switch (order) { case 4: -#ifdef PME_SSE_SPREAD_GATHER -#ifdef PME_SSE_UNALIGNED -#define PME_SPREAD_SSE_ORDER4 +#ifdef PME_SIMD4_SPREAD_GATHER +#ifdef PME_SIMD4_UNALIGNED +#define PME_SPREAD_SIMD4_ORDER4 #else -#define PME_SPREAD_SSE_ALIGNED +#define PME_SPREAD_SIMD4_ALIGNED #define PME_ORDER 4 #endif -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_BSPLINE(4); #endif break; case 5: -#ifdef PME_SSE_SPREAD_GATHER -#define PME_SPREAD_SSE_ALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#define PME_SPREAD_SIMD4_ALIGNED #define PME_ORDER 5 -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_BSPLINE(5); #endif @@ -1583,9 +1604,9 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, static void set_grid_alignment(int *pmegrid_nz, int pme_order) { -#ifdef PME_SSE_SPREAD_GATHER +#ifdef PME_SIMD4_SPREAD_GATHER if (pme_order == 5 -#ifndef PME_SSE_UNALIGNED +#ifndef PME_SIMD4_UNALIGNED || pme_order == 4 #endif ) @@ -1598,8 +1619,8 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order) static void set_gridsize_alignment(int *gridsize, int pme_order) { -#ifdef PME_SSE_SPREAD_GATHER -#ifndef PME_SSE_UNALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#ifndef PME_SIMD4_UNALIGNED if (pme_order == 4) { /* Add extra elements to ensured aligned operations do not go @@ -1650,7 +1671,7 @@ static void pmegrid_init(pmegrid_t *grid, { gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ]; set_gridsize_alignment(&gridsize, pme_order); - snew_aligned(grid->grid, gridsize, 16); + snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT); } else { @@ -1769,7 +1790,7 @@ static void pmegrids_init(pmegrids_t *grids, set_gridsize_alignment(&gridsize, pme_order); snew_aligned(grids->grid_all, grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP, - 16); + SIMD4_ALIGNMENT); for (x = 0; x < grids->nc[XX]; x++) { @@ -1868,6 +1889,8 @@ static void pmegrids_destroy(pmegrids_t *grids) static void realloc_work(pme_work_t *work, int nkx) { + int simd_width; + if (nkx > work->nalloc) { work->nalloc = nkx; @@ -1879,17 +1902,17 @@ static void realloc_work(pme_work_t *work, int nkx) * elements at the end for padding. 
*/ #ifdef PME_SIMD -#define ALIGN_HERE GMX_SIMD_WIDTH_HERE + simd_width = GMX_SIMD_WIDTH_HERE; #else -/* We can use any alignment, apart from 0, so we use 4 */ -#define ALIGN_HERE 4 + /* We can use any alignment, apart from 0, so we use 4 */ + simd_width = 4; #endif sfree_aligned(work->denom); sfree_aligned(work->tmp1); sfree_aligned(work->eterm); - snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); - snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); - snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); + snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real)); + snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real)); + snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real)); srenew(work->m2inv, work->nalloc); } } @@ -1918,7 +1941,7 @@ inline static void calc_exponentials(int start, int end, real f, real *d_aligned gmx_mm_pr lu; gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e; int kx; - f_simd = gmx_load1_pr(&f); + f_simd = gmx_set1_pr(f); for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE) { tmp_d1 = gmx_load_pr(d_aligned+kx); @@ -2267,6 +2290,14 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid, pme_spline_work_t *work; +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED + real thz_buffer[12], *thz_aligned; + real dthz_buffer[12], *dthz_aligned; + + thz_aligned = gmx_simd4_align_real(thz_buffer); + dthz_aligned = gmx_simd4_align_real(dthz_buffer); +#endif + work = pme->spline_work; order = pme->pme_order; @@ -2324,23 +2355,23 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid, switch (order) { case 4: -#ifdef PME_SSE_SPREAD_GATHER -#ifdef PME_SSE_UNALIGNED -#define PME_GATHER_F_SSE_ORDER4 +#ifdef PME_SIMD4_SPREAD_GATHER +#ifdef PME_SIMD4_UNALIGNED +#define PME_GATHER_F_SIMD4_ORDER4 #else -#define PME_GATHER_F_SSE_ALIGNED +#define PME_GATHER_F_SIMD4_ALIGNED #define PME_ORDER 4 #endif -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_FSPLINE(4); #endif break; case 5: -#ifdef PME_SSE_SPREAD_GATHER -#define PME_GATHER_F_SSE_ALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#define PME_GATHER_F_SIMD4_ALIGNED #define PME_ORDER 5 -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_FSPLINE(5); #endif @@ -3014,29 +3045,32 @@ static pme_spline_work_t *make_pme_spline_work(int order) { pme_spline_work_t *work; -#ifdef PME_SSE_SPREAD_GATHER - float tmp[8]; - __m128 zero_SSE; - int of, i; +#ifdef PME_SIMD4_SPREAD_GATHER + real tmp[12], *tmp_aligned; + gmx_simd4_pr zero_S; + gmx_simd4_pr real_mask_S0, real_mask_S1; + int of, i; + + snew_aligned(work, 1, SIMD4_ALIGNMENT); - snew_aligned(work, 1, 16); + tmp_aligned = gmx_simd4_align_real(tmp); - zero_SSE = _mm_setzero_ps(); + zero_S = gmx_simd4_setzero_pr(); /* Generate bit masks to mask out the unused grid entries, * as we only operate on order of the 8 grid entries that are - * load into 2 SSE float registers. + * load into 2 SIMD registers. */ for (of = 0; of < 8-(order-1); of++) { for (i = 0; i < 8; i++) { - tmp[i] = (i >= of && i < of+order ? 1 : 0); + tmp_aligned[i] = (i >= of && i < of+order ? 
-1.0 : 1.0); } - work->mask_SSE0[of] = _mm_loadu_ps(tmp); - work->mask_SSE1[of] = _mm_loadu_ps(tmp+4); - work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE); - work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE); + real_mask_S0 = gmx_simd4_load_pr(tmp_aligned); + real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4); + work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S); + work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S); } #else work = NULL; diff --git a/src/mdlib/pme_simd4.h b/src/mdlib/pme_simd4.h new file mode 100644 index 0000000000..c66f8e99f6 --- /dev/null +++ b/src/mdlib/pme_simd4.h @@ -0,0 +1,356 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/* This include file has code between ifdef's to make sure + * that this performance sensitive code is inlined + * and to remove conditionals and variable loop bounds at compile time. + */ + +#ifdef PME_SPREAD_SIMD4_ORDER4 +/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store. + * This code does not assume any memory alignment for the grid. 
+ */ +{ + gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3; + gmx_simd4_pr tz_S; + gmx_simd4_pr vx_S; + gmx_simd4_pr vx_tz_S; + gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3; + gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3; + + ty_S0 = gmx_simd4_set1_pr(thy[0]); + ty_S1 = gmx_simd4_set1_pr(thy[1]); + ty_S2 = gmx_simd4_set1_pr(thy[2]); + ty_S3 = gmx_simd4_set1_pr(thy[3]); + + /* With order 4 the z-spline is actually aligned */ + tz_S = gmx_simd4_load_pr(thz); + + for (ithx = 0; (ithx < 4); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + valx = qn*thx[ithx]; + + vx_S = gmx_simd4_set1_pr(valx); + + vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S); + + gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0); + gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0); + gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0); + gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0); + + sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0); + sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1); + sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2); + sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3); + + gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0); + gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1); + gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2); + gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3); + } +} +#undef PME_SPREAD_SIMD4_ORDER4 +#endif + + +#ifdef PME_GATHER_F_SIMD4_ORDER4 +/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store. + * This code does not assume any memory alignment for the grid. + */ +{ + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + + gmx_simd4_pr fx_S, fy_S, fz_S; + + gmx_simd4_pr tx_S, ty_S, tz_S; + gmx_simd4_pr dx_S, dy_S, dz_S; + + gmx_simd4_pr gval_S; + + gmx_simd4_pr fxy1_S; + gmx_simd4_pr fz1_S; + + fx_S = gmx_simd4_setzero_pr(); + fy_S = gmx_simd4_setzero_pr(); + fz_S = gmx_simd4_setzero_pr(); + + /* With order 4 the z-spline is actually aligned */ + tz_S = gmx_simd4_load_pr(thz); + dz_S = gmx_simd4_load_pr(dthz); + + for (ithx = 0; (ithx < 4); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + tx_S = gmx_simd4_set1_pr(thx[ithx]); + dx_S = gmx_simd4_set1_pr(dthx[ithx]); + + for (ithy = 0; (ithy < 4); ithy++) + { + index_xy = index_x+(j0+ithy)*pnz; + ty_S = gmx_simd4_set1_pr(thy[ithy]); + dy_S = gmx_simd4_set1_pr(dthy[ithy]); + + gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0); + + fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S); + fz1_S = gmx_simd4_mul_pr(dz_S, gval_S); + + fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + } + } + + gmx_simd4_storeu_pr(fx_tmp, fx_S); + gmx_simd4_storeu_pr(fy_tmp, fy_S); + gmx_simd4_storeu_pr(fz_tmp, fz_S); + + fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; + fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; + fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; +} +#undef PME_GATHER_F_SIMD4_ORDER4 +#endif + + +#ifdef PME_SPREAD_SIMD4_ALIGNED +/* This code assumes that the grid is allocated 4-real aligned + * and that pnz is a multiple of 4. + * This code supports pme_order <= 5. 
+ */ +{ + int offset; + int index; + gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4; + gmx_simd4_pr tz_S0; + gmx_simd4_pr tz_S1; + gmx_simd4_pr vx_S; + gmx_simd4_pr vx_tz_S0; + gmx_simd4_pr vx_tz_S1; + gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04; + gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14; + gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04; + gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14; + + offset = k0 & 3; + + ty_S0 = gmx_simd4_set1_pr(thy[0]); + ty_S1 = gmx_simd4_set1_pr(thy[1]); + ty_S2 = gmx_simd4_set1_pr(thy[2]); + ty_S3 = gmx_simd4_set1_pr(thy[3]); +#if PME_ORDER == 5 + ty_S4 = gmx_simd4_set1_pr(thy[4]); +#endif + +#ifdef GMX_SIMD4_HAVE_UNALIGNED + tz_S0 = gmx_simd4_loadu_pr(thz-offset); + tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); +#else + { + int i; + /* Copy thz to an aligned buffer (unused buffer parts are masked) */ + for (i = 0; i < PME_ORDER; i++) + { + thz_aligned[offset+i] = thz[i]; + } + tz_S0 = gmx_simd4_load_pr(thz_aligned); + tz_S1 = gmx_simd4_load_pr(thz_aligned+4); + } +#endif + tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); + + for (ithx = 0; (ithx < PME_ORDER); ithx++) + { + index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset; + valx = qn*thx[ithx]; + + vx_S = gmx_simd4_set1_pr(valx); + + vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0); + vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1); + + gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz); + gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz); + gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz); + gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz); +#if PME_ORDER == 5 + gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz); +#endif + gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4); + gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4); + gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4); + gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4); +#if PME_ORDER == 5 + gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4); +#endif + + sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00); + sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01); + sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02); + sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03); +#if PME_ORDER == 5 + sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04); +#endif + sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10); + sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11); + sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12); + sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13); +#if PME_ORDER == 5 + sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14); +#endif + + gmx_simd4_store_pr(grid+index+0*pnz, sum_S00); + gmx_simd4_store_pr(grid+index+1*pnz, sum_S01); + gmx_simd4_store_pr(grid+index+2*pnz, sum_S02); + gmx_simd4_store_pr(grid+index+3*pnz, sum_S03); +#if PME_ORDER == 5 + gmx_simd4_store_pr(grid+index+4*pnz, sum_S04); +#endif + gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10); + gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11); + gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12); + gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13); +#if PME_ORDER == 5 + gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14); +#endif + } +} +#undef PME_ORDER +#undef PME_SPREAD_SIMD4_ALIGNED +#endif + + +#ifdef PME_GATHER_F_SIMD4_ALIGNED +/* This code assumes that the grid is allocated 4-real aligned + * and that pnz is a multiple of 4. + * This code supports pme_order <= 5. 
+ */ +{ + int offset; + + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + + gmx_simd4_pr fx_S, fy_S, fz_S; + + gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1; + gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1; + + gmx_simd4_pr gval_S0; + gmx_simd4_pr gval_S1; + + gmx_simd4_pr fxy1_S0; + gmx_simd4_pr fz1_S0; + gmx_simd4_pr fxy1_S1; + gmx_simd4_pr fz1_S1; + gmx_simd4_pr fxy1_S; + gmx_simd4_pr fz1_S; + + offset = k0 & 3; + + fx_S = gmx_simd4_setzero_pr(); + fy_S = gmx_simd4_setzero_pr(); + fz_S = gmx_simd4_setzero_pr(); + +#ifdef GMX_SIMD4_HAVE_UNALIGNED + tz_S0 = gmx_simd4_loadu_pr(thz-offset); + tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); + dz_S0 = gmx_simd4_loadu_pr(dthz-offset); + dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4); +#else + { + int i; + /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */ + for (i = 0; i < PME_ORDER; i++) + { + thz_aligned[offset+i] = thz[i]; + dthz_aligned[offset+i] = dthz[i]; + } + tz_S0 = gmx_simd4_load_pr(thz_aligned); + tz_S1 = gmx_simd4_load_pr(thz_aligned+4); + dz_S0 = gmx_simd4_load_pr(dthz_aligned); + dz_S1 = gmx_simd4_load_pr(dthz_aligned+4); + } +#endif + tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); + dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); + dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]); + + for (ithx = 0; (ithx < PME_ORDER); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + tx_S = gmx_simd4_set1_pr(thx[ithx]); + dx_S = gmx_simd4_set1_pr(dthx[ithx]); + + for (ithy = 0; (ithy < PME_ORDER); ithy++) + { + index_xy = index_x+(j0+ithy)*pnz; + ty_S = gmx_simd4_set1_pr(thy[ithy]); + dy_S = gmx_simd4_set1_pr(dthy[ithy]); + + gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset); + gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4); + + fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0); + fz1_S0 = gmx_simd4_mul_pr(dz_S0, gval_S0); + fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1); + fz1_S1 = gmx_simd4_mul_pr(dz_S1, gval_S1); + + fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1); + fz1_S = gmx_simd4_add_pr(fz1_S0, fz1_S1); + + fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + } + } + + gmx_simd4_store_pr(fx_tmp, fx_S); + gmx_simd4_store_pr(fy_tmp, fy_S); + gmx_simd4_store_pr(fz_tmp, fz_S); + + fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; + fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; + fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; +} +#undef PME_ORDER +#undef PME_GATHER_F_SIMD4_ALIGNED +#endif diff --git a/src/mdlib/pme_sse_single.h b/src/mdlib/pme_sse_single.h deleted file mode 100644 index ae03f54f46..0000000000 --- a/src/mdlib/pme_sse_single.h +++ /dev/null @@ -1,325 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2004, The GROMACS development team, - * check out http://www.gromacs.org for more information. - * Copyright (c) 2012,2013, by the GROMACS development team, led by - * David van der Spoel, Berk Hess, Erik Lindahl, and including many - * others, as listed in the AUTHORS file in the top-level source - * directory and at http://www.gromacs.org. 
- * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ - -/* This include file has code between ifdef's to make sure - * that this performance sensitive code is inlined - * and to remove conditionals and variable loop bounds at compile time. - */ - -#ifdef PME_SPREAD_SSE_ORDER4 -/* This code does not assume any memory alignment. - * This code only works for pme_order = 4. - */ -{ - __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3; - __m128 tz_SSE; - __m128 vx_SSE; - __m128 vx_tz_SSE; - __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3; - __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3; - - ty_SSE0 = _mm_load1_ps(&thy[0]); - ty_SSE1 = _mm_load1_ps(&thy[1]); - ty_SSE2 = _mm_load1_ps(&thy[2]); - ty_SSE3 = _mm_load1_ps(&thy[3]); - - tz_SSE = _mm_loadu_ps(thz); - - for (ithx = 0; (ithx < 4); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - valx = qn*thx[ithx]; - - vx_SSE = _mm_load1_ps(&valx); - - vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE); - - gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0); - gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0); - gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0); - gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0); - - sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0)); - sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1)); - sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2)); - sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3)); - - _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0); - _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1); - _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2); - _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3); - } -} -#undef PME_SPREAD_SSE_ORDER4 -#endif - - -#ifdef PME_GATHER_F_SSE_ORDER4 -/* This code does not assume any memory alignment. - * This code only works for pme_order = 4. 
- */ -{ - float fx_tmp[4], fy_tmp[4], fz_tmp[4]; - - __m128 fx_SSE, fy_SSE, fz_SSE; - - __m128 tx_SSE, ty_SSE, tz_SSE; - __m128 dx_SSE, dy_SSE, dz_SSE; - - __m128 gval_SSE; - - __m128 fxy1_SSE; - __m128 fz1_SSE; - - fx_SSE = _mm_setzero_ps(); - fy_SSE = _mm_setzero_ps(); - fz_SSE = _mm_setzero_ps(); - - tz_SSE = _mm_loadu_ps(thz); - dz_SSE = _mm_loadu_ps(dthz); - - for (ithx = 0; (ithx < 4); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - tx_SSE = _mm_load1_ps(thx+ithx); - dx_SSE = _mm_load1_ps(dthx+ithx); - - for (ithy = 0; (ithy < 4); ithy++) - { - index_xy = index_x+(j0+ithy)*pnz; - ty_SSE = _mm_load1_ps(thy+ithy); - dy_SSE = _mm_load1_ps(dthy+ithy); - - gval_SSE = _mm_loadu_ps(grid+index_xy+k0); - - fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE); - fz1_SSE = _mm_mul_ps(dz_SSE, gval_SSE); - - fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE)); - fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE)); - fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE)); - } - } - - _mm_storeu_ps(fx_tmp, fx_SSE); - _mm_storeu_ps(fy_tmp, fy_SSE); - _mm_storeu_ps(fz_tmp, fz_SSE); - - fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; - fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; - fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; -} -#undef PME_GATHER_F_SSE_ORDER4 -#endif - - -#ifdef PME_SPREAD_SSE_ALIGNED -/* This code assumes that the grid is allocated 16 bit aligned - * and that pnz is a multiple of 4. - * This code supports pme_order <= 5. - */ -{ - int offset; - int index; - __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4; - __m128 tz_SSE0; - __m128 tz_SSE1; - __m128 vx_SSE; - __m128 vx_tz_SSE0; - __m128 vx_tz_SSE1; - __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04; - __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14; - __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04; - __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14; - - offset = k0 & 3; - - ty_SSE0 = _mm_load1_ps(&thy[0]); - ty_SSE1 = _mm_load1_ps(&thy[1]); - ty_SSE2 = _mm_load1_ps(&thy[2]); - ty_SSE3 = _mm_load1_ps(&thy[3]); -#if PME_ORDER == 5 - ty_SSE4 = _mm_load1_ps(&thy[4]); -#endif - - tz_SSE0 = _mm_loadu_ps(thz-offset); - tz_SSE1 = _mm_loadu_ps(thz-offset+4); - tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]); - tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]); - - for (ithx = 0; (ithx < PME_ORDER); ithx++) - { - index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset; - valx = qn*thx[ithx]; - - vx_SSE = _mm_load1_ps(&valx); - - vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0); - vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1); - - gri_SSE00 = _mm_load_ps(grid+index+0*pnz); - gri_SSE01 = _mm_load_ps(grid+index+1*pnz); - gri_SSE02 = _mm_load_ps(grid+index+2*pnz); - gri_SSE03 = _mm_load_ps(grid+index+3*pnz); -#if PME_ORDER == 5 - gri_SSE04 = _mm_load_ps(grid+index+4*pnz); -#endif - gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4); - gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4); - gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4); - gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4); -#if PME_ORDER == 5 - gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4); -#endif - - sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0)); - sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1)); - sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2)); - sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3)); -#if PME_ORDER == 5 - sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4)); -#endif - sum_SSE10 = 
_mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0)); - sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1)); - sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2)); - sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3)); -#if PME_ORDER == 5 - sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4)); -#endif - - _mm_store_ps(grid+index+0*pnz, sum_SSE00); - _mm_store_ps(grid+index+1*pnz, sum_SSE01); - _mm_store_ps(grid+index+2*pnz, sum_SSE02); - _mm_store_ps(grid+index+3*pnz, sum_SSE03); -#if PME_ORDER == 5 - _mm_store_ps(grid+index+4*pnz, sum_SSE04); -#endif - _mm_store_ps(grid+index+0*pnz+4, sum_SSE10); - _mm_store_ps(grid+index+1*pnz+4, sum_SSE11); - _mm_store_ps(grid+index+2*pnz+4, sum_SSE12); - _mm_store_ps(grid+index+3*pnz+4, sum_SSE13); -#if PME_ORDER == 5 - _mm_store_ps(grid+index+4*pnz+4, sum_SSE14); -#endif - } -} -#undef PME_ORDER -#undef PME_SPREAD_SSE_ALIGNED -#endif - - -#ifdef PME_GATHER_F_SSE_ALIGNED -/* This code assumes that the grid is allocated 16 bit aligned - * and that pnz is a multiple of 4. - * This code supports pme_order <= 5. - */ -{ - int offset; - - float fx_tmp[4], fy_tmp[4], fz_tmp[4]; - - __m128 fx_SSE, fy_SSE, fz_SSE; - - __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1; - __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1; - - __m128 gval_SSE0; - __m128 gval_SSE1; - - __m128 fxy1_SSE0; - __m128 fz1_SSE0; - __m128 fxy1_SSE1; - __m128 fz1_SSE1; - __m128 fxy1_SSE; - __m128 fz1_SSE; - - offset = k0 & 3; - - fx_SSE = _mm_setzero_ps(); - fy_SSE = _mm_setzero_ps(); - fz_SSE = _mm_setzero_ps(); - - tz_SSE0 = _mm_loadu_ps(thz-offset); - dz_SSE0 = _mm_loadu_ps(dthz-offset); - tz_SSE1 = _mm_loadu_ps(thz-offset+4); - dz_SSE1 = _mm_loadu_ps(dthz-offset+4); - tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]); - dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]); - tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]); - dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]); - - for (ithx = 0; (ithx < PME_ORDER); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - tx_SSE = _mm_load1_ps(thx+ithx); - dx_SSE = _mm_load1_ps(dthx+ithx); - - for (ithy = 0; (ithy < PME_ORDER); ithy++) - { - index_xy = index_x+(j0+ithy)*pnz; - ty_SSE = _mm_load1_ps(thy+ithy); - dy_SSE = _mm_load1_ps(dthy+ithy); - - gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset); - gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4); - - fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0); - fz1_SSE0 = _mm_mul_ps(dz_SSE0, gval_SSE0); - fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1); - fz1_SSE1 = _mm_mul_ps(dz_SSE1, gval_SSE1); - - fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1); - fz1_SSE = _mm_add_ps(fz1_SSE0, fz1_SSE1); - - fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE)); - fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE)); - fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE)); - } - } - - _mm_store_ps(fx_tmp, fx_SSE); - _mm_store_ps(fy_tmp, fy_SSE); - _mm_store_ps(fz_tmp, fz_SSE); - - fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; - fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; - fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; -} -#undef PME_ORDER -#undef PME_GATHER_F_SSE_ALIGNED -#endif
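
Note (illustrative only, not part of the patch): the following minimal plain-C sketch shows the arithmetic that the PME_SPREAD_SIMD4_ORDER4 kernel in pme_simd4.h performs. The function name spread_charge_order4_scalar and the local typedef of real are assumptions made only so the example is self-contained; in the SIMD4 kernel the inner z-loop is replaced by one 4-wide load, one gmx_simd4_madd_pr and one store per (ithx, ithy) pair.

typedef float real;  /* GROMACS "real": float here for illustration, double with GMX_DOUBLE */

static void spread_charge_order4_scalar(real *grid, int pny, int pnz,
                                        int i0, int j0, int k0, real qn,
                                        const real *thx, const real *thy,
                                        const real *thz)
{
    int ithx, ithy, ithz;

    for (ithx = 0; ithx < 4; ithx++)
    {
        int  index_x = (i0 + ithx)*pny*pnz;
        real valx    = qn*thx[ithx];     /* charge weighted by the x-spline */

        for (ithy = 0; ithy < 4; ithy++)
        {
            real valxy = valx*thy[ithy];

            /* The SIMD4 kernel performs the four z-updates below as a single
             * gmx_simd4_madd_pr(vx_tz_S, ty_Sn, gri_Sn) with one load and
             * one store of the grid row.
             */
            for (ithz = 0; ithz < 4; ithz++)
            {
                grid[index_x + (j0 + ithy)*pnz + k0 + ithz] += valxy*thz[ithz];
            }
        }
    }
}

The gather kernels traverse the same grid region in reverse: they read the grid values and accumulate dthx*thy*thz, thx*dthy*thz and thx*thy*dthz weighted sums into fx, fy and fz. This is also why the aligned variants apply mask_S0/mask_S1 to the z-spline (and derivative) registers: spline entries outside the pme_order window are zeroed, so the extra grid elements touched by full 4-wide loads never contribute.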