Merge "Avoid using undefined CMake variables" into release-4-6
author    Szilárd Páll <pszilard@kth.se>
          Wed, 19 Dec 2012 14:38:23 +0000 (15:38 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Wed, 19 Dec 2012 14:38:23 +0000 (15:38 +0100)
27 files changed:
cmake/gmxCheckGCCVersion.cmake [deleted file]
include/copyrite.h
include/gmx_simd_macros.h [moved from include/gmx_x86_simd_macros.h with 89% similarity]
include/gmx_x86_avx_256.h
include/types/nb_verlet.h
src/gmxlib/copyrite.c
src/gmxlib/gmx_detect_hardware.c
src/kernel/calc_verletbuf.c
src/kernel/pme_loadbal.c
src/mdlib/forcerec.c
src/mdlib/nbnxn_atomdata.c
src/mdlib/nbnxn_internal.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c with 83% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h with 69% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c with 84% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h with 71% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h with 86% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h with 99% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h with 95% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h with 90% similarity]
src/mdlib/nbnxn_search.c
src/mdlib/nbnxn_search_simd_2xnn.h [new file with mode: 0644]
src/mdlib/nbnxn_search_simd_4xn.h [moved from src/mdlib/nbnxn_search_x86_simd.h with 76% similarity]
src/mdlib/sim_util.c

diff --git a/cmake/gmxCheckGCCVersion.cmake b/cmake/gmxCheckGCCVersion.cmake
deleted file mode 100644 (file)
index f9223ef..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2012, by the GROMACS development team, led by
-# David van der Spoel, Berk Hess, Erik Lindahl, and including many
-# others, as listed in the AUTHORS file in the top-level source
-# directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-#
-# Check GCC version and if any of the 4.1.x family compiler suites is found
-# quit the build system generating process. 
-#
-# The GCC 4.1.x compilers contain an optimization related bug which might 
-# results in code that exhibits incorrect behaviour and often leads to 
-# exploding systems or crashes. 
-#
-# For further details see e.g. 
-# https://bugs.launchpad.net/ubuntu/+source/gcc-4.1/+bug/158799
-#
-# Szilard Pall (pszilard@cbr.su.se)
-#
-
-if(NOT GMX_DISABLE_GCC41_CHECK)
-
-if(CMAKE_COMPILER_IS_GNUCC)
-    # if we have -dumpversion flag use that, otherwise try the --version
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
-        RESULT_VARIABLE _gcc_dumpversion_res
-        OUTPUT_VARIABLE _gcc_dumpversion_out
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    # if gcc returned with error the -dumpversion is not available 
-    if(${_gcc_dumpversion_res} EQUAL 0)
-        if(${_gcc_dumpversion_out} MATCHES ".*4\\.1\\.[0-9]+.*")
-            message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x 
-                family (detected version: ${_gcc_dumpversion_out}). These compilers 
-                contain an optimization related bug which might results in code that 
-                exhibits incorrect behaviour and often leads to exploding systems or 
-                crashes. To disable this check set GMX_DISABLE_GCC41_CHECK=YES.")
-        endif()
-    else()    
-        message(WARNING " The GCC compiler in use does not support the -dumpversion flag. 
-            Will attempt parsing the version from the \"gcc --version\" output.")        
-        execute_process(COMMAND ${CMAKE_C_COMPILER} --version
-            OUTPUT_VARIABLE _gcc_version_out
-            OUTPUT_STRIP_TRAILING_WHITESPACE)            
-        if("${_gcc_version_out}" MATCHES ".*4\\.1\\.[0-9]+.*")
-            message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x 
-                family. These compiler  compilers contain an optimization related bug 
-                which might results in code that exhibits incorrect behaviour and 
-                often leads to exploding systems or crashes. To disable this check set 
-                GMX_DISABLE_GCC41_CHECK=YES.")
-        endif()
-    endif()
-endif()
-
-endif()
diff --git a/include/copyrite.h b/include/copyrite.h
index 994734fb533a9de1b7a1dbad384192e6b1045ebf..ab10fa9947144ef6336d9cf122ae1d8cdb81ff2e 100644 (file)
@@ -70,10 +70,10 @@ CopyrightText[] = {
 };
 
 static const char *
-GPLText[] = {
+LicenseText[] = {
   "This program is free software; you can redistribute it and/or",
-  "modify it under the terms of the GNU General Public License",
-  "as published by the Free Software Foundation; either version 2",
+  "modify it under the terms of the GNU Lesser General Public License",
+  "as published by the Free Software Foundation; either version 2.1",
   "of the License, or (at your option) any later version."
 };
 
similarity index 89%
rename from include/gmx_x86_simd_macros.h
rename to include/gmx_simd_macros.h
index fd85c6e5e6422db3e173628086db42b14c75bd28..2cd5ed91cb987e38ea40b554b22f042ace3c7c7c 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/* The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
 /* Undefine all defines used below so we can include this file multiple times
  * with different settings from the same source file.
  */
 
 /* NOTE: floor and blend are NOT available with SSE2 only acceleration */
 
-#undef GMX_X86_SIMD_WIDTH_HERE
+#undef GMX_SIMD_WIDTH_HERE
 
 #undef gmx_epi32
 
  */
 
 #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
-"You should define GMX_MM128_HERE or GMX_MM256_HERE"
+#error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
 #endif
 
 #if defined GMX_MM128_HERE && defined GMX_MM256_HERE
-"You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
+#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
 #endif
 
 #ifdef GMX_MM128_HERE
 
 #include "gmx_x86_simd_single.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_WIDTH_HERE  4
 
 #define gmx_mm_pr  __m128
 
 
 #include "gmx_x86_simd_double.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  2
+#define GMX_SIMD_WIDTH_HERE  2
 
 #define gmx_mm_pr  __m128d
 
 
 #include "gmx_x86_simd_single.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  8
+#define GMX_SIMD_WIDTH_HERE  8
 
 #define gmx_mm_pr  __m256
 
 #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_ps
 #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_ps
 
+#define gmx_loaddh_pr     gmx_mm256_load4_ps
+
+/* Half SIMD-width type */
+#define gmx_mm_hpr  __m128
+
+/* Half SIMD-width macros */
+#define gmx_load_hpr      _mm_load_ps
+#define gmx_load1_hpr(x)  _mm_set1_ps((x)[0])
+#define gmx_store_hpr     _mm_store_ps
+#define gmx_add_hpr       _mm_add_ps
+#define gmx_sub_hpr       _mm_sub_ps
+
+#define gmx_sum4_hpr      gmx_mm256_sum4h_m128
+
+/* Conversion between half and full SIMD-width */
+#define gmx_2hpr_to_pr    gmx_mm256_set_m128
+
 #else
 
 #include "gmx_x86_simd_double.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_WIDTH_HERE  4
 
 #define gmx_mm_pr  __m256d
 
diff --git a/include/gmx_x86_avx_256.h b/include/gmx_x86_avx_256.h
index 1e444f46e9478a4ede3a2c98ae07ae7c52e9cc35..9f266e834ff031090a34f3aadd91d315e4ae6208 100644 (file)
@@ -128,6 +128,16 @@ gmx_mm256_set_m128(__m128 hi, __m128 lo)
 }
 
 
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+    __m128 a;
+
+    a = _mm_load_ps(p);
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
+}
+
+
 static __m256d
 gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
 {
@@ -147,6 +157,13 @@ gmx_mm256_set_m128d(__m128d hi, __m128d lo)
 }
 
 
+static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
+{
+    __m256 sum;
+
+    sum = _mm256_add_ps(x,y);
+    return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1));
+}
 
 
 static void
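
These two helpers back the new half-width macros in gmx_simd_macros.h: gmx_mm256_load4_ps broadcasts one 4-float load into both 128-bit lanes of a __m256, and gmx_mm256_sum4h_m128 adds x and y and folds the two lanes together, so lane i of the 128-bit result is x[i]+y[i]+x[i+4]+y[i+4]. A standalone illustration of the lane-duplication trick (compile with -mavx; not part of this commit):

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    float  p[4] = { 1, 2, 3, 4 };
    float  out[8];
    __m128 a    = _mm_loadu_ps(p);
    /* same construction as gmx_mm256_load4_ps: both lanes get p[0..3] */
    __m256 dup  = _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);

    _mm256_storeu_ps(out, dup);
    printf("%g %g %g %g | %g %g %g %g\n",
           out[0], out[1], out[2], out[3],
           out[4], out[5], out[6], out[7]);   /* 1 2 3 4 | 1 2 3 4 */
    return 0;
}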
diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h
index 32dd9432d76f2b685b1180eadaaf7589cc39ea34..8a33375c03f04564c62fabe666a5a2ddcf63f8a3 100644 (file)
 extern "C" {
 #endif
 
-/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */
-enum { nbkNotSet = 0, 
-       nbk4x4_PlainC, 
-       nbk4xN_X86_SIMD128,
-       nbk4xN_X86_SIMD256,
-       nbk8x8x8_CUDA,
-       nbk8x8x8_PlainC };
+#ifdef GMX_X86_SSE2
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+
+#ifdef GMX_X86_AVX_256
+/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+#define GMX_NBNXN_SIMD_BITWIDTH  256
+#else
+#define GMX_NBNXN_SIMD_BITWIDTH  128
+#endif
+
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense and are only implemented
+ * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ */
+#define GMX_NBNXN_SIMD_4XN
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#endif
+
+
+/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+typedef enum
+{
+    nbnxnkNotSet = 0, 
+    nbnxnk4x4_PlainC, 
+    nbnxnk4xN_SIMD_4xN,
+    nbnxnk4xN_SIMD_2xNN,
+    nbnxnk8x8x8_CUDA,
+    nbnxnk8x8x8_PlainC,
+    nbnxnkNR
+} nbnxn_kernel_type;
 
 /* Note that _mm_... intrinsics can be converted to either SSE or AVX
  * depending on compiler flags.
  * For gcc we check for __AVX__
  * At least a check for icc should be added (if there is a macro)
  */
-static const char *nbk_name[] =
-  { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+static const char *nbnxn_kernel_name[nbnxnkNR] =
+  { "not set", "plain C",
+#if !(defined GMX_X86_SSE2)
+    "not available", "not available",
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 #ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
-    "SSE2 4x4",
+    "SSE2", "SSE2",
 #else
-    "SSE2 4x2",
+    "SSE4.1", "SSE4.1",
 #endif
 #else
-#ifndef GMX_DOUBLE
-    "SSE4.1 4x4",
-#else
-    "SSE4.1 4x2",
+    "AVX-128", "AVX-128",
 #endif
-#endif
-#else
-#ifndef GMX_DOUBLE
-    "AVX-128 4x4",
 #else
-    "AVX-128 4x2",
+    "AVX-256",  "AVX-256",
 #endif
 #endif
-#ifndef GMX_DOUBLE
-    "AVX-256 4x8",
-#else
-    "AVX-256 4x4",
-#endif
-    "CUDA 8x8x8", "plain C 8x8x8" };
+    "CUDA", "plain C" };
 
 enum { ewaldexclTable, ewaldexclAnalytical };
 
@@ -119,9 +137,9 @@ typedef struct {
 
 /* non-bonded data structure with Verlet-type cut-off */
 typedef struct {
-    nbnxn_search_t           nbs;   /* n vs n atom pair searching data          */
-    int                      ngrp;  /* number of interaction groups             */
-    nonbonded_verlet_group_t grp[2];/* local and non-local interaction group    */
+    nbnxn_search_t           nbs;   /* n vs n atom pair searching data       */
+    int                      ngrp;  /* number of interaction groups          */
+    nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
 
     gmx_bool         bUseGPU;          /* TRUE when GPU acceleration is used */
     nbnxn_cuda_ptr_t cu_nbv;           /* pointer to CUDA nb verlet data     */
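
The j-cluster size implied by each kernel type follows directly from the defines above. A plausible sketch (an assumption, not code shown in this commit) of what nbnxn_kernel_to_cj_size(), used by forcerec.c below for its "4x8"-style report, returns for the CPU kernel types:

/* j-cluster size per kernel type, given the definitions above */
static int sketch_cj_size(int kernel_type)
{
    /* reals per SIMD register, e.g. 256/(8*4) = 8 for AVX-256 single */
    int simd_width = (int)(GMX_NBNXN_SIMD_BITWIDTH/(8*sizeof(real)));

    switch (kernel_type)
    {
        case nbnxnk4x4_PlainC:    return NBNXN_CPU_CLUSTER_I_SIZE; /* 4   */
        case nbnxnk4xN_SIMD_4xN:  return simd_width;               /* N   */
        case nbnxnk4xN_SIMD_2xNN: return simd_width/2;             /* N+N */
        default:                  return 0;
    }
}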
diff --git a/src/gmxlib/copyrite.c b/src/gmxlib/copyrite.c
index 574dcee2f7a475255003a94e841ba10c67a78ba2..f67f28d7d53481aae7001fd752bf1845d99998e1 100644 (file)
@@ -237,10 +237,11 @@ void CopyRight(FILE *out,const char *szProgram)
    * name of a file. Otherwise, we won't be able to find the library dir.
    */
 #define NCR (int)asize(CopyrightText)
+/* TODO: Is this exception still needed? */
 #ifdef GMX_FAHCORE
-#define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
+#define NLICENSE 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
 #else
-#define NGPL (int)asize(GPLText)
+#define NLICENSE (int)asize(LicenseText)
 #endif
 
   char buf[256],tmpstr[1024];
@@ -270,8 +271,8 @@ void CopyRight(FILE *out,const char *szProgram)
 
   for(i=0; (i<NCR); i++) 
     sp_print(out,CopyrightText[i]);
-  for(i=0; (i<NGPL); i++)
-    sp_print(out,GPLText[i]);
+  for(i=0; (i<NLICENSE); i++)
+    sp_print(out,LicenseText[i]);
 
   fprintf(out,"\n");
 
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c
index 21e0c649b92a5c6e8c80c0580e68b7c3aea45c1f..e4fef1fe00021f3d7d061d016d0a5edd431b20ed 100644 (file)
@@ -63,6 +63,9 @@
  * ridiculous number. */
 static unsigned int max_gpu_ids_user = 64;
 
+static const char* invalid_gpuid_hint =
+    "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+
 /* FW decl. */
 void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 
@@ -173,7 +176,8 @@ static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
     {
         if (idstr[i] < '0' || idstr[i] > '9')
         {
-            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
+            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+                      idstr[i], invalid_gpuid_hint);
         }
         idlist[i] = idstr[i] - '0';
     }
@@ -492,10 +496,15 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
     bGPUBin      = FALSE;
 #endif
 
-    /* Bail if binary is not compiled with GPU on */
+    /* Bail if the binary was not compiled with GPU acceleration, but its use
+     * was requested either explicitly (-nb gpu) or implicitly (GPU ID passed). */
     if (bForceUseGPU && !bGPUBin)
     {
-        gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+    }
+    if (gpu_id != NULL && !bGPUBin)
+    {
+        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
     }
 
     /* run the detection if the binary was compiled with GPU support */
@@ -545,7 +554,7 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
 
             if (nid == 0)
             {
-                gmx_fatal(FARGS, "Empty GPU ID string passed\n");
+                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
             }
 
             res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
diff --git a/src/kernel/calc_verletbuf.c b/src/kernel/calc_verletbuf.c
index 8a1f6ef00d00d85bdf820e3fb77d17e8a12761f9..68e3cf628ce288301a49a83f34d0e03dea767f63 100644 (file)
@@ -78,17 +78,14 @@ void verletbuf_get_list_setup(gmx_bool bGPU,
     }
     else
     {
-#ifndef GMX_X86_SSE2
+#ifndef GMX_NBNXN_SIMD
         list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
 #else
-        int simd_width;
-
-#ifdef GMX_X86_AVX_256
-        simd_width = 256;
-#else
-        simd_width = 128;
+        list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#ifdef GMX_NBNXN_SIMD_2XNN
+        /* We assume the smallest cluster size to be on the safe side */
+        list_setup->cluster_size_j /= 2;
 #endif
-        list_setup->cluster_size_j = simd_width/(sizeof(real)*8);
 #endif
     }
 }
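
Worked through, the new expression gives cluster_size_j = 128/(4*8) = 4 or 256/(4*8) = 8 in single precision and 2 or 4 in double; the 2xNN halving then turns AVX-256 single's 8 into 4. A throwaway check of the arithmetic (remember from nb_verlet.h above that 2xNN only exists for 256-bit single precision):

#include <stdio.h>

int main(void)
{
    int bitwidths[2] = { 128, 256 };
    int realsizes[2] = { 4, 8 };   /* single, double precision */
    int b, r;

    for (b = 0; b < 2; b++)
    {
        for (r = 0; r < 2; r++)
        {
            int cj = bitwidths[b]/(realsizes[r]*8);

            printf("%d-bit SIMD, %d-byte real: 4xN cj=%d, 2xNN cj=%d\n",
                   bitwidths[b], realsizes[r], cj, cj/2);
        }
    }
    return 0;
}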
diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c
index 6367ac25bde5507880f7ce7019f46c5fbb323a67..a208f08c4f2e96178ff8359fafdb04b5e5356abd 100644 (file)
@@ -624,7 +624,8 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
     ic->ewaldcoeff = set->ewaldcoeff;
 
     bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+    if (pme_lb->cutoff_scheme == ecutsVERLET &&
+        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
     {
         nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
     }
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c
index b773dff15f9f2889031140b4f4fd57cab2f674bf..ce27dda2b7fdbae5fe1a67b6a648cb4711e2ba6b 100644 (file)
@@ -1403,41 +1403,48 @@ static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 static void pick_nbnxn_kernel_cpu(FILE *fp,
                                   const t_commrec *cr,
                                   const gmx_cpuid_t cpuid_info,
+                                  const t_inputrec *ir,
                                   int *kernel_type,
                                   int *ewald_excl)
 {
-    *kernel_type = nbk4x4_PlainC;
+    *kernel_type = nbnxnk4x4_PlainC;
     *ewald_excl  = ewaldexclTable;
 
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD
     {
-        /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
-         * On AMD Bulldozer AVX-256 is much slower than AVX-128.
-         */
-        if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
-           gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
-        {
-#ifdef GMX_X86_AVX_256
-            *kernel_type = nbk4xN_X86_SIMD256;
-#else
-            *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+        *kernel_type = nbnxnk4xN_SIMD_4xN;
 #endif
-        }
-        else
+#ifdef GMX_NBNXN_SIMD_2XNN
+        /* We expect the 2xNN kernels to be faster in most cases */
+        *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
         {
-            *kernel_type = nbk4xN_X86_SIMD128;
+            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+             * 10% with HT, 50% without HT, but extra zero interactions
+             * can compensate. As we currently don't detect the actual use
+             * of HT, switch to 4x8 to avoid a potential performance hit.
+             */
+            *kernel_type = nbnxnk4xN_SIMD_4xN;
         }
-
-        if (getenv("GMX_NBNXN_AVX128") != NULL)
+#endif
+        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
         {
-            *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+            *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+            gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
         }
-        if (getenv("GMX_NBNXN_AVX256") != NULL)
+        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
         {
-#ifdef GMX_X86_AVX_256
-            *kernel_type = nbk4xN_X86_SIMD256;
+#ifdef GMX_NBNXN_SIMD_2XNN
+            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 #else
-            gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
+            gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 #endif
         }
 
@@ -1466,6 +1473,7 @@ static void pick_nbnxn_kernel(FILE *fp,
                               const gmx_hw_info_t *hwinfo,
                               gmx_bool use_cpu_acceleration,
                               gmx_bool *bUseGPU,
+                              const t_inputrec *ir,
                               int *kernel_type,
                               int *ewald_excl,
                               gmx_bool bDoNonbonded)
@@ -1475,7 +1483,7 @@ static void pick_nbnxn_kernel(FILE *fp,
 
     assert(kernel_type);
 
-    *kernel_type = nbkNotSet;
+    *kernel_type = nbnxnkNotSet;
     *ewald_excl  = ewaldexclTable;
 
     bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
@@ -1521,7 +1529,7 @@ static void pick_nbnxn_kernel(FILE *fp,
 
     if (bEmulateGPU)
     {
-        *kernel_type = nbk8x8x8_PlainC;
+        *kernel_type = nbnxnk8x8x8_PlainC;
 
         if (bDoNonbonded)
         {
@@ -1530,31 +1538,28 @@ static void pick_nbnxn_kernel(FILE *fp,
     }
     else if (bGPU)
     {
-        *kernel_type = nbk8x8x8_CUDA;
+        *kernel_type = nbnxnk8x8x8_CUDA;
     }
 
-    if (*kernel_type == nbkNotSet)
+    if (*kernel_type == nbnxnkNotSet)
     {
         if (use_cpu_acceleration)
         {
-            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
+            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
                                   kernel_type,ewald_excl);
         }
         else
         {
-            *kernel_type = nbk4x4_PlainC;
+            *kernel_type = nbnxnk4x4_PlainC;
         }
     }
 
     if (bDoNonbonded && fp != NULL)
     {
-        if (MASTER(cr))
-        {
-            fprintf(stderr,"Using %s non-bonded kernels\n",
-                    nbk_name[*kernel_type]);
-        }
-        fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
-                nbk_name[*kernel_type]);
+        fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
+                nbnxn_kernel_name[*kernel_type],
+                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+                nbnxn_kernel_to_cj_size(*kernel_type));
     }
 }
 
@@ -1754,12 +1759,13 @@ static void init_nb_verlet(FILE *fp,
     {
         nbv->grp[i].nbl_lists.nnbl = 0;
         nbv->grp[i].nbat           = NULL;
-        nbv->grp[i].kernel_type    = nbkNotSet;
+        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 
         if (i == 0) /* local */
         {
             pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                               &nbv->bUseGPU,
+                              ir,
                               &nbv->grp[i].kernel_type,
                               &nbv->grp[i].ewald_excl,
                               fr->bNonbonded);
@@ -1771,6 +1777,7 @@ static void init_nb_verlet(FILE *fp,
                 /* Use GPU for local, select a CPU kernel for non-local */
                 pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                                   NULL,
+                                  ir,
                                   &nbv->grp[i].kernel_type,
                                   &nbv->grp[i].ewald_excl,
                                   fr->bNonbonded);
@@ -1834,7 +1841,7 @@ static void init_nb_verlet(FILE *fp,
 
     for(i=0; i<nbv->ngrp; i++)
     {
-        if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
         {
             nb_alloc = &pmalloc;
             nb_free  = &pfree;
diff --git a/src/mdlib/nbnxn_atomdata.c b/src/mdlib/nbnxn_atomdata.c
index aaa3f22c8eb678df6a779eac3708089e373293a4..a11f764914917d3c6019c117815c1567a7f14f95 100644 (file)
@@ -151,8 +151,8 @@ static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
     ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
     ma((void **)&out->Vc  ,out->nV*sizeof(*out->Vc  ));
 
-    if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
-        nb_kernel_type == nbk4xN_X86_SIMD256)
+    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
     {
         cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
         out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
@@ -598,17 +598,25 @@ void nbnxn_atomdata_init(FILE *fp,
     nbat->lj_comb = NULL;
     if (simple)
     {
+        int pack_x;
+
         switch (nb_kernel_type)
         {
-        case nbk4xN_X86_SIMD128:
-            nbat->XFormat = nbatX4;
-            break;
-        case nbk4xN_X86_SIMD256:
-#ifndef GMX_DOUBLE
-            nbat->XFormat = nbatX8;
-#else
-            nbat->XFormat = nbatX4;
-#endif
+        case nbnxnk4xN_SIMD_4xN:
+        case nbnxnk4xN_SIMD_2xNN:
+            pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+                         nbnxn_kernel_to_cj_size(nb_kernel_type));
+            switch (pack_x)
+            {
+                case 4:
+                    nbat->XFormat = nbatX4;
+                    break;
+                case 8:
+                    nbat->XFormat = nbatX8;
+                    break;
+                default:
+                    gmx_incons("Unsupported packing width");
+            }
             break;
         default:
             nbat->XFormat = nbatXYZ;
@@ -1034,14 +1042,14 @@ nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
 #else
 #define GMX_MM128_HERE
 #endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
 
     int       i,s;
     gmx_mm_pr dest_SSE,src_SSE;
 
     if (bDestSet)
     {
-        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
         {
             dest_SSE = gmx_load_pr(dest+i);
             for(s=0; s<nsrc; s++)
@@ -1054,7 +1062,7 @@ nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
     }
     else
     {
-        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
         {
             dest_SSE = gmx_load_pr(src[0]+i);
             for(s=1; s<nsrc; s++)
diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h
index 9d0be66e30eac01cdc808ae482a2a19f146cee75..1a7f5bfcea634b6722f5851c9198f2b9f0a692d7 100644 (file)
@@ -95,29 +95,32 @@ typedef struct {
     int  nsubc_tot;      /* Total number of subcell, used for printing  */
 } nbnxn_grid_t;
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
 #define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd128 {
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+typedef struct nbnxn_x_ci_simd_4xn {
     /* The i-cluster coordinates for simple search */
     gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
     gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
     gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
     gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd128_t;
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd256 {
+} nbnxn_x_ci_simd_4xn_t;
+
+typedef struct nbnxn_x_ci_simd_2xnn {
     /* The i-cluster coordinates for simple search */
     gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
-    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
     gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
-    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd256_t;
-#undef GMX_MM256_HERE
-#endif
+} nbnxn_x_ci_simd_2xnn_t;
+
 #endif
 
 /* Working data for the actual i-supercell during pair search */
@@ -126,11 +129,9 @@ typedef struct nbnxn_list_work {
 
     float *bb_ci;      /* The bounding boxes, pbc shifted, for each cluster */
     real  *x_ci;       /* The coordinates, pbc shifted, for each atom       */
-#ifdef NBNXN_SEARCH_SSE
-    nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
-#ifdef GMX_X86_AVX_256
-    nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+    nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
+    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 #endif
     int  cj_ind;       /* The current cj_ind index for the current list     */
     int  cj4_init;     /* The first unitialized cj4 block                   */
@@ -155,17 +156,18 @@ gmx_icell_set_x_t(int ci,
                   nbnxn_list_work_t *work);
 
 static gmx_icell_set_x_t icell_set_x_simple;
-#ifdef NBNXN_SEARCH_SSE
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
-#ifdef GMX_X86_AVX_256
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 #endif
 static gmx_icell_set_x_t icell_set_x_supersub;
 #ifdef NBNXN_SEARCH_SSE
 static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 #endif
 
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+
 /* Local cycle count struct for profiling */
 typedef struct {
     int          count;
similarity index 83%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
index 8018f65f5ea3b10948f7289e597a456f1e46d79a..0106d4d7801b1d716ae1199a0cea87c61b143a48 100644 (file)
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
 
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD_2XNN
 
-#include "nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernel_simd_2xnn.h"
 
-/* Include all flavors of the 128-bit SSE or AVX kernel loops */
+/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
 
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
 #define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 
 /* Analytical reaction-field kernels */
 #define CALC_COUL_RF
 
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 #undef CALC_COUL_RF
 
 #define CALC_COUL_TAB
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_TAB
 #define CALC_COUL_EWALD
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_EWALD
@@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 
 enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
                                   real *Vvdw,real *Vc)
 {
+    const int simd_width   = GMX_SIMD_WIDTH_HERE;
+    const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
     int ng_p2,i,j,j0,j1,c,s;
 
-#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
-
     ng_p2 = (1<<ng_2log);
 
     /* The size of the x86 SIMD energy group buffer array is:
-     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+     * ng*ng*ng_p2*unrollj_half*simd_width
      */
     for(i=0; i<ng; i++)
     {
@@ -163,34 +170,34 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_HALF; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
+                for(s=0; s<unrollj_half; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
                     Vc  [i*ng+j0] += VSc  [c+0];
                     Vc  [i*ng+j1] += VSc  [c+1];
-                    c += SIMD_WIDTH + 2;
+                    c += simd_width/2 + 2;
                 }
             }
         }
     }
 }
 
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD_2XNN */
 
 void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec, 
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw)
-#ifdef GMX_X86_SSE2
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       *nbl_list,
+                       const nbnxn_atomdata_t     *nbat,
+                       const interaction_const_t  *ic,
+                       int                        ewald_excl,
+                       rvec                       *shift_vec, 
+                       int                        force_flags,
+                       int                        clearF,
+                       real                       *fshift,
+                       real                       *Vc,
+                       real                       *Vvdw)
+#ifdef GMX_NBNXN_SIMD_2XNN
 {
     int              nnbl;
     nbnxn_pairlist_t **nbl;
@@ -320,6 +327,6 @@ nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
 }
 #else
 {
-    gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
+    gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
 }
 #endif
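
The p_nbk_ener/p_nbk_energrp/p_nbk_noener tables above use a token-pasting macro so each coulomb/LJ-combination kernel name is spelled out only once. A stripped-down demo of the pattern with hypothetical names:

#include <stdio.h>

typedef void (*p_kernel)(void);

static void kernel_rf_ener(void)  { printf("rf, energies\n");  }
static void kernel_tab_ener(void) { printf("tab, energies\n"); }

#define NBK_FN(elec) kernel_##elec##_ener
static p_kernel p_ener[2] = { NBK_FN(rf), NBK_FN(tab) };
#undef NBK_FN

int main(void)
{
    p_ener[0]();   /* dispatch, like p_nbk_ener[coultRF][ljcrGEOM] above */
    p_ener[1]();
    return 0;
}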
similarity index 69%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h
index 9dd757dd75d547498db490f8e2bf7da192920e2d..c3a6b3b6ee90a4ef70c3f824e7c483692df68272 100644 (file)
@@ -35,8 +35,8 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifndef _nbnxn_kernel_x86_simd128_h
-#define _nbnxn_kernel_x86_simd128_h
+#ifndef _nbnxn_kernel_simd_2xnn_h
+#define _nbnxn_kernel_simd_2xnn_h
 
 #include "typedefs.h"
 
 extern "C" {
 #endif
 
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width 2*N
+ * by packing 2*N j-atom variables in SIMD registers.
+ */
 void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec,
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw);
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       *nbl_list,
+                       const nbnxn_atomdata_t     *nbat,
+                       const interaction_const_t  *ic,
+                       int                        ewald_excl,
+                       rvec                       *shift_vec,
+                       int                        force_flags,
+                       int                        clearF,
+                       real                       *fshift,
+                       real                       *Vc,
+                       real                       *Vvdw);
 
 #ifdef __cplusplus
 }
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h
new file mode 100644 (file)
index 0000000..67e63b1
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This file includes all x86 SIMD kernel flavors.
+ * Only the Electrostatics type and optionally the VdW cut-off check
+ * need to be set before including this file.
+ */
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
new file mode 100644 (file)
index 0000000..cab66c3
--- /dev/null
@@ -0,0 +1,752 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel duplicates the data for N j-particles in
+ * 2xN wide SIMD registers to operate on 2 i-particles at once.
+ * This leads to 4/2=2 sets of most instructions. Therefore we call
+ * this kernel 2x(N+N) = 2xnn.
+ *
+ * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ * and instructions removed.
+ *
+ * An alternative would be to load two different clusters of N j-particles
+ * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ * of instructions, which could lead to better scheduling. But we actually
+ * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ * It could be worth trying this option, but it takes some more effort.
+ */
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or the virial are required we calculate them
+ * separately, as it is then easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask the cut-off,
+ * this can be faster with blendv (only available with SSE4.1 and later).
+ */
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define CUTOFF_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define CUTOFF_BLENDV
+#endif
+#endif
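
As an aside, the two masking strategies this block chooses between look like the following in plain SSE intrinsics (a standalone illustration, compile with -msse4.1; not this commit's code). _mm_blendv_ps selects by sign bit, so a negative rc2-rsq, i.e. a pair beyond the cut-off, picks zero:

#include <smmintrin.h>   /* SSE4.1 */

static __m128 mask_cmp_and(__m128 rsq, __m128 rc2, __m128 rinv)
{
    __m128 wco = _mm_cmplt_ps(rsq, rc2);   /* all-ones inside the cut-off */

    return _mm_and_ps(rinv, wco);
}

static __m128 mask_sub_blendv(__m128 rsq, __m128 rc2, __m128 rinv)
{
    /* one cmp+and pair replaced by sub+blendv */
    return _mm_blendv_ps(rinv, _mm_setzero_ps(), _mm_sub_ps(rc2, rsq));
}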
+
+        {
+            int        cj,aj,ajx,ajy,ajz;
+
+#ifdef ENERGY_GROUPS
+            /* Energy group indices for two atoms packed into one int */
+            int        egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+            /* Interaction (non-exclusion) mask of all 1's or 0's */
+            gmx_mm_pr  int_SSE0;
+            gmx_mm_pr  int_SSE2;
+#endif
+
+            gmx_mm_pr  jxSSE,jySSE,jzSSE;
+            gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
+            gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
+            gmx_mm_pr  tx_SSE0,ty_SSE0,tz_SSE0;
+            gmx_mm_pr  tx_SSE2,ty_SSE2,tz_SSE2;
+            gmx_mm_pr  rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
+            gmx_mm_pr  rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
+#ifndef CUTOFF_BLENDV
+            /* wco: within cut-off, mask of all 1's or 0's */
+            gmx_mm_pr  wco_SSE0;
+            gmx_mm_pr  wco_SSE2;
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            gmx_mm_pr  wco_vdw_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  wco_vdw_SSE2;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+            /* 1/r masked with the interaction mask */
+            gmx_mm_pr  rinv_ex_SSE0;
+            gmx_mm_pr  rinv_ex_SSE2;
+#endif
+            gmx_mm_pr  jq_SSE;
+            gmx_mm_pr  qq_SSE0;
+            gmx_mm_pr  qq_SSE2;
+#ifdef CALC_COUL_TAB
+            /* The force (PME mesh force) we need to subtract from 1/r^2 */
+            gmx_mm_pr  fsub_SSE0;
+            gmx_mm_pr  fsub_SSE2;
+#endif
+#ifdef CALC_COUL_EWALD
+            gmx_mm_pr  brsq_SSE0,brsq_SSE2;
+            gmx_mm_pr  ewcorr_SSE0,ewcorr_SSE2;
+#endif
+
+            /* frcoul = (1/r - fsub)*r */
+            gmx_mm_pr  frcoul_SSE0;
+            gmx_mm_pr  frcoul_SSE2;
+#ifdef CALC_COUL_TAB
+            /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+            gmx_mm_pr  r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
+            gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
+            /* Table index: rs truncated to an int */
+#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
+            gmx_epi32  ti_SSE0,ti_SSE2;
+#else
+            __m128i    ti_SSE0,ti_SSE2;
+#endif
+            /* Linear force table values */
+            gmx_mm_pr  ctab0_SSE0,ctab1_SSE0;
+            gmx_mm_pr  ctab0_SSE2,ctab1_SSE2;
+#ifdef CALC_ENERGIES
+            /* Quadratic energy table value */
+            gmx_mm_pr  ctabv_SSE0;
+            gmx_mm_pr  ctabv_SSE2;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+            /* The potential (PME mesh) we need to subtract from 1/r */
+            gmx_mm_pr  vc_sub_SSE0;
+            gmx_mm_pr  vc_sub_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            /* Electrostatic potential */
+            gmx_mm_pr  vcoul_SSE0;
+            gmx_mm_pr  vcoul_SSE2;
+#endif
+#endif
+            /* The force times 1/r */
+            gmx_mm_pr  fscal_SSE0;
+            gmx_mm_pr  fscal_SSE2;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+            /* LJ sigma_j/2 and sqrt(epsilon_j) */
+            gmx_mm_pr  hsig_j_SSE,seps_j_SSE;
+            /* LJ sigma_ij and epsilon_ij */
+            gmx_mm_pr  sig_SSE0,eps_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sig_SSE2,eps_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            gmx_mm_pr  sig2_SSE0,sig6_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sig2_SSE2,sig6_SSE2;
+#endif
+#endif /* LJ_COMB_LB */
+#endif /* CALC_LJ */
+
+#ifdef LJ_COMB_GEOM
+            gmx_mm_pr  c6s_j_SSE,c12s_j_SSE;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+            /* Index for loading LJ parameters, complicated when interleaving */
+            int         aj2;
+#endif
+
+#ifndef FIX_LJ_C
+            /* LJ C6 and C12 parameters, used with geometric comb. rule */
+            gmx_mm_pr  c6_SSE0,c12_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  c6_SSE2,c12_SSE2;
+#endif
+#endif
+
+            /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+            gmx_mm_pr  rinvsix_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  rinvsix_SSE2;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+            gmx_mm_pr  sir_SSE0,sir2_SSE0,sir6_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sir_SSE2,sir2_SSE2,sir6_SSE2;
+#endif
+#endif
+
+            gmx_mm_pr  FrLJ6_SSE0,FrLJ12_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  FrLJ6_SSE2,FrLJ12_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            gmx_mm_pr  VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+            /* j-cluster index */
+            cj            = l_cj[cjind].cj;
+
+            /* Atom indices (of the first atom in the cluster) */
+            aj            = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+            aj2           = aj*2;
+#else
+            aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+            ajx           = aj*DIM;
+#else
+            ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+            ajy           = ajx + STRIDE;
+            ajz           = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
+            {
+                /* Load integer interaction mask */
+                /* With AVX there are no integer operations, so cast to real */
+                gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
+                /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
+                 * With gcc we don't need the cast, but it's faster.
+                 */
+#define cast_cvt(x)  _mm256_cvtepi32_ps(_mm256_castps_si256(x))
+                int_SSE0  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
+                int_SSE2  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE);
+#undef cast_cvt
+            }
+#endif
+            /* load j atom coordinates */
+            jxSSE         = gmx_loaddh_pr(x+ajx);
+            jySSE         = gmx_loaddh_pr(x+ajy);
+            jzSSE         = gmx_loaddh_pr(x+ajz);
+
+            /* Calculate distance */
+            dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
+            dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
+            dz_SSE0       = gmx_sub_pr(iz_SSE0,jzSSE);
+            dx_SSE2       = gmx_sub_pr(ix_SSE2,jxSSE);
+            dy_SSE2       = gmx_sub_pr(iy_SSE2,jySSE);
+            dz_SSE2       = gmx_sub_pr(iz_SSE2,jzSSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2      = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+#ifndef CUTOFF_BLENDV
+            wco_SSE0      = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2      = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+            /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+            if (cj == ci_sh)
+            {
+                wco_SSE0  = gmx_and_pr(wco_SSE0,diag_SSE0);
+                wco_SSE2  = gmx_and_pr(wco_SSE2,diag_SSE2);
+            }
+#else
+#error "only UNROLLJ == UNROLLI currently supported in the joined kernels"
+#endif
+#else /* EXCL_FORCES */
+            /* Remove all excluded atom pairs from the list */
+            wco_SSE0      = gmx_and_pr(wco_SSE0,int_SSE0);
+            wco_SSE2      = gmx_and_pr(wco_SSE2,int_SSE2);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+            {
+                int i,j;
+                real tmp[2*UNROLLJ];
+                /* this kernel packs i-rows 0,1 into wco_SSE0 and 2,3 into wco_SSE2 */
+                for(i=0; i<UNROLLI; i+=2)
+                {
+                    gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : wco_SSE2);
+                    for(j=0; j<2*UNROLLJ; j++)
+                    {
+                        if (!(tmp[j] == 0))
+                        {
+                            npair++;
+                        }
+                    }
+                }
+            }
+#endif
+
+#ifdef CHECK_EXCLS
+            /* For excluded pairs add a small number to avoid r^-6 = NaN */
+            rsq_SSE0      = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
+            rsq_SSE2      = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
+#endif
+
+            /* Calculate 1/r */
+            rinv_SSE0     = gmx_invsqrt_pr(rsq_SSE0);
+            rinv_SSE2     = gmx_invsqrt_pr(rsq_SSE2);
+
+#ifdef CALC_COULOMB
+            /* Load parameters for j atom */
+            jq_SSE        = gmx_loaddh_pr(q+aj);
+            qq_SSE0       = gmx_mul_pr(iq_SSE0,jq_SSE);
+            qq_SSE2       = gmx_mul_pr(iq_SSE2,jq_SSE);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+            load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+#ifndef HALF_LJ
+            load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
+            c6s_j_SSE     = gmx_loaddh_pr(ljc+aj2+0);
+            c12s_j_SSE    = gmx_loaddh_pr(ljc+aj2+STRIDE);
+            c6_SSE0       = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
+#ifndef HALF_LJ
+            c6_SSE2       = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
+#endif
+            c12_SSE0      = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
+#ifndef HALF_LJ
+            c12_SSE2      = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
+            hsig_j_SSE    = gmx_loaddh_pr(ljc+aj2+0);
+            seps_j_SSE    = gmx_loaddh_pr(ljc+aj2+STRIDE);
+
+            sig_SSE0      = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
+            eps_SSE0      = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
+#ifndef HALF_LJ
+            sig_SSE2      = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
+            eps_SSE2      = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifndef CUTOFF_BLENDV
+            rinv_SSE0     = gmx_and_pr(rinv_SSE0,wco_SSE0);
+            rinv_SSE2     = gmx_and_pr(rinv_SSE2,wco_SSE2);
+#else
+            /* We only need to mask for the cut-off: blendv is faster */
+            rinv_SSE0     = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
+            rinv_SSE2     = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
+#endif
+
+            rinvsq_SSE0   = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
+            rinvsq_SSE2   = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
+
+#ifdef CALC_COULOMB
+            /* Note that here we calculate force*r, not the usual force/r.
+             * This allows avoiding masking the reaction-field contribution,
+             * as frcoul is later multiplied by rinvsq which has been
+             * masked with the cut-off check.
+             */
+
+#ifdef EXCL_FORCES
+            /* Only add 1/r for non-excluded atom pairs */
+            rinv_ex_SSE0  = gmx_and_pr(rinv_SSE0,int_SSE0);
+            rinv_ex_SSE2  = gmx_and_pr(rinv_SSE2,int_SSE2);
+#else
+            /* No exclusion forces, we always need 1/r */
+#define     rinv_ex_SSE0    rinv_SSE0
+#define     rinv_ex_SSE2    rinv_SSE2
+#endif
+
+#ifdef CALC_COUL_RF
+            /* Electrostatic interactions */
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
+
+#ifdef CALC_ENERGIES
+            vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
+            vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+            /* We need to mask (or limit) rsq for the cut-off,
+             * as large distances can cause an overflow in gmx_pmecorrF/V.
+             */
+#ifndef CUTOFF_BLENDV
+            brsq_SSE0     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0));
+            brsq_SSE2     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2));
+#else
+            /* Strangely, putting mul on a separate line is slower (icc 13) */
+            brsq_SSE0     = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)));
+            brsq_SSE2     = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)));
+#endif
+            ewcorr_SSE0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE);
+            ewcorr_SSE2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE);
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2)));
+
+#ifdef CALC_ENERGIES
+            vc_sub_SSE0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE);
+            vc_sub_SSE2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE);
+#endif
+
+#endif /* CALC_COUL_EWALD */
+
+#ifdef CALC_COUL_TAB
+            /* Electrostatic interactions */
+            r_SSE0        = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
+            r_SSE2        = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
+            /* Convert r to scaled table units */
+            rs_SSE0       = gmx_mul_pr(r_SSE0,invtsp_SSE);
+            rs_SSE2       = gmx_mul_pr(r_SSE2,invtsp_SSE);
+            /* Truncate scaled r to an int */
+            ti_SSE0       = gmx_cvttpr_epi32(rs_SSE0);
+            ti_SSE2       = gmx_cvttpr_epi32(rs_SSE2);
+#ifdef GMX_X86_SSE4_1
+            /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
+            rf_SSE0       = gmx_floor_pr(rs_SSE0);
+            rf_SSE2       = gmx_floor_pr(rs_SSE2);
+#else
+            rf_SSE0       = gmx_cvtepi32_pr(ti_SSE0);
+            rf_SSE2       = gmx_cvtepi32_pr(ti_SSE2);
+#endif
+            frac_SSE0     = gmx_sub_pr(rs_SSE0,rf_SSE0);
+            frac_SSE2     = gmx_sub_pr(rs_SSE2,rf_SSE2);
+
+            /* Load and interpolate table forces and possibly energies.
+             * Force and energy can be combined in one table, stride 4: FDV0
+             * or in two separate tables with stride 1: F and V
+             * Currently single precision uses FDV0, double F and V.
+             */
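+            /* A scalar sketch of the interpolation below: with
+             * rt = r*tabq_scale, i = (int)rt and frac = rt - i,
+             *   fsub = ctab0 + frac*ctab1 ~= F[i] + frac*(F[i+1]-F[i]),
+             * where ctab1 holds the stored table differences.
+             */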
+#ifndef CALC_ENERGIES
+            load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
+            load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
+#else
+#ifdef TAB_FDV0
+            load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+            load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#else
+            load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+            load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#endif
+#endif
+            fsub_SSE0     = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
+            fsub_SSE2     = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
+
+#ifdef CALC_ENERGIES
+            vc_sub_SSE0   = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
+            vc_sub_SSE2   = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
+#endif
+#endif /* CALC_COUL_TAB */
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+            /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
+            vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
+            vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
+#else
+            vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
+            vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
+#endif
+#endif
+            
+            vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
+            vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
+#endif
+
+#ifdef CALC_ENERGIES
+            /* Mask energy for cut-off and diagonal */
+            vcoul_SSE0    = gmx_and_pr(vcoul_SSE0,wco_SSE0);
+            vcoul_SSE2    = gmx_and_pr(vcoul_SSE2,wco_SSE2);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+            /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+            wco_vdw_SSE0  = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
+#ifndef HALF_LJ
+            wco_vdw_SSE2  = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
+#endif
+#else
+            /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define     wco_vdw_SSE0    wco_SSE0
+#define     wco_vdw_SSE2    wco_SSE2
+#endif
+
+#ifndef LJ_COMB_LB
+            rinvsix_SSE0  = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
+#ifdef EXCL_FORCES
+            rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+            rinvsix_SSE2  = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
+#ifdef EXCL_FORCES
+            rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+            FrLJ6_SSE0    = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
+#ifndef HALF_LJ
+            FrLJ6_SSE2    = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
+#endif
+            FrLJ12_SSE0   = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
+#ifndef HALF_LJ
+            FrLJ12_SSE2   = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+            sir_SSE0      = gmx_mul_pr(sig_SSE0,rinv_SSE0);
+#ifndef HALF_LJ
+            sir_SSE2      = gmx_mul_pr(sig_SSE2,rinv_SSE2);
+#endif
+            sir2_SSE0     = gmx_mul_pr(sir_SSE0,sir_SSE0);
+#ifndef HALF_LJ
+            sir2_SSE2     = gmx_mul_pr(sir_SSE2,sir_SSE2);
+#endif
+            sir6_SSE0     = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
+#ifdef EXCL_FORCES
+            sir6_SSE0     = gmx_and_pr(sir6_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+            sir6_SSE2     = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
+#ifdef EXCL_FORCES
+            sir6_SSE2     = gmx_and_pr(sir6_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            sir6_SSE0     = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            sir6_SSE2     = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+            FrLJ6_SSE0    = gmx_mul_pr(eps_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+            FrLJ6_SSE2    = gmx_mul_pr(eps_SSE2,sir6_SSE2);
+#endif
+            FrLJ12_SSE0   = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+            FrLJ12_SSE2   = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
+#endif
+#if defined CALC_ENERGIES
+            /* We need C6 and C12 to calculate the LJ potential shift */
+            sig2_SSE0     = gmx_mul_pr(sig_SSE0,sig_SSE0);
+#ifndef HALF_LJ
+            sig2_SSE2     = gmx_mul_pr(sig_SSE2,sig_SSE2);
+#endif
+            sig6_SSE0     = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
+#ifndef HALF_LJ
+            sig6_SSE2     = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
+#endif
+            c6_SSE0       = gmx_mul_pr(eps_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+            c6_SSE2       = gmx_mul_pr(eps_SSE2,sig6_SSE2);
+#endif
+            c12_SSE0      = gmx_mul_pr(c6_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+            c12_SSE2      = gmx_mul_pr(c6_SSE2,sig6_SSE2);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+            
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+            /* Extract the group pair index per j pair.
+             * Energy groups are stored per i-cluster, so things get
+             * complicated when the i- and j-cluster sizes don't match.
+             */
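+            /* For example, with neg_2log = 2 (up to 4 energy groups),
+             * each energrp entry packs four 2-bit group indices, one
+             * per i-cluster atom; below we extract the groups of two
+             * j-atoms at a time (egps_jshift bits) and scale by
+             * egps_jstride to get energy-buffer offsets.
+             */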
+            {
+                int egps_j;
+#if UNROLLJ == 2
+                egps_j    = nbat->energrp[cj>>1];
+                egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+                /* We assume UNROLLI <= UNROLLJ */
+                int jdi;
+                for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+                {
+                    int jj;
+                    egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+                    for(jj=0; jj<(UNROLLI/2); jj++)
+                    {
+                        egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+                    }
+                }
+#endif
+            }
+#endif
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+            vctotSSE      = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2));
+#else
+            add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj);
+            add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+            /* Calculate the LJ energies */
+            VLJ6_SSE0     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
+#ifndef HALF_LJ
+            VLJ6_SSE2     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
+#endif
+            VLJ12_SSE0    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
+#ifndef HALF_LJ
+            VLJ12_SSE2    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
+#endif
+
+            VLJ_SSE0      = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
+#endif
+            /* The potential shift should be removed for pairs beyond cut-off */
+            VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
+#endif
+#ifdef CHECK_EXCLS
+            /* The potential shift should be removed for excluded pairs */
+            VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,int_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,int_SSE2);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+            VvdwtotSSE    = gmx_add_pr(VvdwtotSSE,
+#ifndef HALF_LJ
+                                       gmx_add_pr(VLJ_SSE0,VLJ_SSE2)
+#else
+                                       VLJ_SSE0
+#endif
+                                      );
+#else
+            add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj);
+#ifndef HALF_LJ
+            add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+            fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,
+#ifdef CALC_COULOMB
+                                                   gmx_add_pr(frcoul_SSE0,
+#else
+                                                   (
+#endif
+                                                    gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
+#else
+            fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+            fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,
+#ifdef CALC_COULOMB
+                                                   gmx_add_pr(frcoul_SSE2,
+#else
+                                                   (
+#endif
+                                                    gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
+#else
+            /* Atoms 2 and 3 don't have LJ, so we only add Coulomb forces */
+            fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
+#endif
+
+            /* Calculate temporary vectorial force */
+            tx_SSE0       = gmx_mul_pr(fscal_SSE0,dx_SSE0);
+            tx_SSE2       = gmx_mul_pr(fscal_SSE2,dx_SSE2);
+            ty_SSE0       = gmx_mul_pr(fscal_SSE0,dy_SSE0);
+            ty_SSE2       = gmx_mul_pr(fscal_SSE2,dy_SSE2);
+            tz_SSE0       = gmx_mul_pr(fscal_SSE0,dz_SSE0);
+            tz_SSE2       = gmx_mul_pr(fscal_SSE2,dz_SSE2);
+
+            /* Increment i atom force */
+            fix_SSE0      = gmx_add_pr(fix_SSE0,tx_SSE0);
+            fix_SSE2      = gmx_add_pr(fix_SSE2,tx_SSE2);
+            fiy_SSE0      = gmx_add_pr(fiy_SSE0,ty_SSE0);
+            fiy_SSE2      = gmx_add_pr(fiy_SSE2,ty_SSE2);
+            fiz_SSE0      = gmx_add_pr(fiz_SSE0,tz_SSE0);
+            fiz_SSE2      = gmx_add_pr(fiz_SSE2,tz_SSE2);
+
+            /* Decrement j atom force */
+            gmx_store_hpr(f+ajx,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) ));
+            gmx_store_hpr(f+ajy,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) ));
+            gmx_store_hpr(f+ajz,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) ));
+        }
+
+#undef  rinv_ex_SSE0
+#undef  rinv_ex_SSE2
+
+#undef  wco_vdw_SSE0
+#undef  wco_vdw_SSE2
+
+#undef  CUTOFF_BLENDV
+
+#undef  EXCL_FORCES
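
A note on the data layout used by the inner loop above: each 256-bit register pairs two i-atoms, one broadcast per 128-bit lane, against a single 4-atom j-cluster duplicated in both lanes. Below is a minimal single-precision AVX sketch of the two load patterns; the helper names are illustrative, the real macros are gmx_load2_hpr and gmx_loaddh_pr from this patch, and the sketch assumes GMX_MM256_HERE without GMX_DOUBLE:

    #include <immintrin.h>

    /* i-atom data: broadcast x[0] into the low lane and x[1] into the
     * high lane, like gmx_load2_hpr. */
    static __m256 load2_hpr_sketch(const float *x)
    {
        return _mm256_insertf128_ps(_mm256_set1_ps(x[0]),
                                    _mm_set1_ps(x[1]), 1);
    }

    /* j-atom data: load one 4-float j-cluster and duplicate it into
     * both lanes, like gmx_loaddh_pr (assumes 16-byte-aligned j data). */
    static __m256 loaddh_pr_sketch(const float *x)
    {
        __m128 j = _mm_load_ps(x);
        return _mm256_insertf128_ps(_mm256_castps128_ps256(j), j, 1);
    }

With this layout one register evaluates i0 and i1 against j0..j3 simultaneously, which is why only the _SSE0 and _SSE2 variable pairs appear in the 2xnn loop, compared with _SSE0.._SSE3 in the 4xn flavor.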
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
new file mode 100644 (file)
index 0000000..faa445e
--- /dev/null
@@ -0,0 +1,707 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+#include "gmx_simd_macros.h"
+
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
+
+/* For the 2xNN kernels the stride is 4 for all width/precision combinations */
+#define STRIDE     4
+
+#ifdef GMX_MM128_HERE
+#ifndef GMX_DOUBLE
+/* SSE single precision 4x4 kernel */
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#define TAB_FDV0
+#else
+/* SSE double precision 4x2 kernel */
+#define SUM_SIMD(x) (x[0]+x[1])
+#endif
+#endif
+
+#ifdef GMX_MM256_HERE
+#ifndef GMX_DOUBLE
+/* AVX single precision 4x8 kernel */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#define TAB_FDV0
+#else
+/* AVX double precision 4x4 kernel */
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#endif
+#endif
+
+#define SIMD_MASK_ALL   0xffffffff
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and use a different combination rule for different
+ * ci's, as using no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
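+/* For example, with CALC_COUL_EWALD, LJ_COMB_GEOM and CALC_ENERGIES set,
+ * the macros below expand the kernel name to
+ * nbnxn_kernel_simd_2xnn_ewald_comb_geom_ener.
+ */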
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
+#else
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
+#endif
+#endif
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+                            (const nbnxn_pairlist_t     *nbl,
+                             const nbnxn_atomdata_t     *nbat,
+                             const interaction_const_t  *ic,
+                             rvec                       *shift_vec, 
+                             real                       *f
+#ifdef CALC_SHIFTFORCES
+                             ,
+                             real                       *fshift
+#endif
+#ifdef CALC_ENERGIES
+                             ,
+                             real                       *Vvdw,
+                             real                       *Vc
+#endif
+                            )
+{
+    const nbnxn_ci_t   *nbln;
+    const nbnxn_cj_t   *l_cj;
+    const int          *type;
+    const real         *q;
+    const real         *shiftvec;
+    const real         *x;
+    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
+    real       facel;
+    real       *nbfp_ptr;
+    int        nbfp_stride;
+    int        n,ci,ci_sh;
+    int        ish,ish3;
+    gmx_bool   half_LJ,do_coul;
+    int        sci,scix,sciy,sciz,sci2;
+    int        cjind0,cjind1,cjind;
+    int        ip,jp;
+
+#ifdef ENERGY_GROUPS
+    int        Vstride_i;
+    int        egps_ishift,egps_imask;
+    int        egps_jshift,egps_jmask,egps_jstride;
+    int        egps_i;
+    real       *vvdwtp[UNROLLI];
+    real       *vctp[UNROLLI];
+#endif
+    
+    gmx_mm_pr  shX_SSE;
+    gmx_mm_pr  shY_SSE;
+    gmx_mm_pr  shZ_SSE;
+    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
+    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
+    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
+    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+    __m128     fix_SSE,fiy_SSE,fiz_SSE;
+#else
+    __m256d    fix_SSE,fiy_SSE,fiz_SSE;
+#endif
+#else
+    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
+    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
+#endif
+
+    /* AVX: use floating point masks, as there are no integer instructions */
+    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
+    gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+
+    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
+    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
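+    /* The diagonal masks zero the j<=i pairs when an i-cluster interacts
+     * with its own j-cluster: the low/high 128-bit lanes of diag_SSE0
+     * mask for i-atoms 0/1 and those of diag_SSE2 for i-atoms 2/3.
+     */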
+
+#ifndef GMX_MM256_HERE
+    __m128i    zeroi_SSE = _mm_setzero_si128();
+#endif
+#ifdef GMX_X86_SSE4_1
+    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
+#endif
+
+    gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
+    gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
+    gmx_mm_pr  iq_SSE2=gmx_setzero_pr();
+    gmx_mm_pr  mrc_3_SSE;
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
+#endif
+
+#ifdef CALC_COUL_TAB
+    /* Coulomb table variables */
+    gmx_mm_pr  invtsp_SSE;
+    const real *tab_coul_F;
+#ifndef TAB_FDV0
+    const real *tab_coul_V;
+#endif
+#ifdef GMX_MM256_HERE
+    int        ti0_array[2*UNROLLJ-1],*ti0;
+    int        ti2_array[2*UNROLLJ-1],*ti2;
+#endif
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  mhalfsp_SSE;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+    gmx_mm_pr beta2_SSE,beta_SSE;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+    gmx_mm_pr  sh_ewald_SSE;
+#endif
+
+#ifdef LJ_COMB_LB
+    const real *ljc;
+
+    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
+    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
+#else
+#ifdef FIX_LJ_C
+    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
+    real       *pvdw_c6,*pvdw_c12;
+    gmx_mm_pr  c6_SSE0,c12_SSE0;
+    gmx_mm_pr  c6_SSE2,c12_SSE2;
+#endif
+
+#ifdef LJ_COMB_GEOM
+    const real *ljc;
+
+    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
+    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
+    gmx_mm_pr  c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
+    gmx_mm_pr  c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+    gmx_mm_pr  vctotSSE,VvdwtotSSE;
+    gmx_mm_pr  sixthSSE,twelvethSSE;
+
+    gmx_mm_pr  avoid_sing_SSE;
+    gmx_mm_pr  rc2_SSE;
+#ifdef VDW_CUTOFF_CHECK
+    gmx_mm_pr  rcvdw2_SSE;
+#endif
+
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;
+
+    /* cppcheck-suppress unassignedVariable */
+    real       tmpsum_array[15],*tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+    /* cppcheck-suppress unassignedVariable */
+    real       shf_array[15],*shf;
+#endif
+
+    int ninner;
+
+#ifdef COUNT_PAIRS
+    int npair=0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+    ljc = nbat->lj_comb;
+#else
+    /* No combination rule used */
+#ifndef GMX_DOUBLE
+    nbfp_ptr    = nbat->nbfp_s4;
+#define NBFP_STRIDE  4
+#else
+    nbfp_ptr    = nbat->nbfp;
+#define NBFP_STRIDE  2
+#endif
+    nbfp_stride = NBFP_STRIDE;
+#endif
+
+#ifdef CALC_COUL_TAB
+#ifdef GMX_MM256_HERE
+    /* Generate aligned table pointers */
+    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+#endif
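+    /* The masking above rounds each pointer up to the next multiple of
+     * UNROLLJ*sizeof(real) bytes within its oversized array, so the
+     * UNROLLJ table indices can be written with aligned SIMD stores.
+     */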
+
+    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+    tab_coul_F = ic->tabq_coul_FDV0;
+#else
+    tab_coul_F = ic->tabq_coul_F;
+    tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+    q                   = nbat->q;
+    type                = nbat->type;
+    facel               = ic->epsfac;
+    shiftvec            = shift_vec[0];
+    x                   = nbat->x;
+
+    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+    sixthSSE    = gmx_set1_pr(1.0/6.0);
+    twelvethSSE = gmx_set1_pr(1.0/12.0);
+
+    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
+    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+    hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+    
+    moh_rc_SSE = gmx_set1_pr(-ic->c_rf); 
+#endif
+
+#ifdef CALC_ENERGIES
+    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+#endif
+#ifdef CALC_SHIFTFORCES
+    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+#endif
+
+#ifdef FIX_LJ_C
+    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+    for(jp=0; jp<UNROLLJ; jp++)
+    {
+        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+    }
+    c6_SSE0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+    c6_SSE1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+    c6_SSE2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+    c6_SSE3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+    c12_SSE0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+    c12_SSE1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+    c12_SSE2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+    c12_SSE3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+    egps_ishift  = nbat->neg_2log;
+    egps_imask   = (1<<egps_ishift) - 1;
+    egps_jshift  = 2*nbat->neg_2log;
+    egps_jmask   = (1<<egps_jshift) - 1;
+    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+    l_cj = nbl->cj;
+
+    ninner = 0;
+    for(n=0; n<nbl->nci; n++)
+    {
+        nbln = &nbl->ci[n];
+
+        ish              = (nbln->shift & NBNXN_CI_SHIFT);
+        ish3             = ish*3;
+        cjind0           = nbln->cj_ind_start;      
+        cjind1           = nbln->cj_ind_end;    
+        /* Currently this only works for super-cells equal to sub-cells */
+        ci               = nbln->ci;
+        ci_sh            = (ish == CENTRAL ? ci : -1);
+
+        shX_SSE = gmx_load1_pr(shiftvec+ish3);
+        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
+        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+        sci              = ci*STRIDE;
+        scix             = sci*DIM;
+        sci2             = sci*2;
+#else
+        sci              = (ci>>1)*STRIDE;
+        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
+        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
+        sci             += (ci & 1)*(STRIDE>>1);
+#endif
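+        /* With UNROLLJ > 4, two 4-atom i-clusters fit in one j-cluster-
+         * sized stride of the x/q/ljc arrays, hence the ci>>1 base index
+         * and the (ci & 1) half-stride offset above.
+         */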
+
+        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+
+#ifdef ENERGY_GROUPS
+        egps_i = nbat->energrp[ci];
+        {
+            int ia,egp_ia;
+
+            for(ia=0; ia<UNROLLI; ia++)
+            {
+                egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+                vctp[ia]   = Vc   + egp_ia*Vstride_i;
+            }
+        }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+        {
+            int  ia;
+            real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+            Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+            Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+            Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+            /* beta/sqrt(pi) */
+            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+            for(ia=0; ia<UNROLLI; ia++)
+            {
+                real qi;
+
+                qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+                Vc[0]
+#endif
+                    -= facel*qi*qi*Vc_sub_self;
+            }
+        }
+#endif
+
+#define gmx_load2_hpr(x)  _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1)
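+/* gmx_load2_hpr(x) produces, in single precision,
+ * [ x[0] x[0] x[0] x[0] | x[1] x[1] x[1] x[1] ]:
+ * two consecutive i-atom values, each broadcast across one 128-bit lane.
+ */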
+
+        /* Load i atom data */
+        sciy             = scix + STRIDE;
+        sciz             = sciy + STRIDE;
+        ix_SSE0          = gmx_add_pr(gmx_load2_hpr(x+scix)  ,shX_SSE);
+        ix_SSE2          = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE);
+        iy_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciy)  ,shY_SSE);
+        iy_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE);
+        iz_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciz)  ,shZ_SSE);
+        iz_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
+
+        /* With half_LJ we currently always calculate Coulomb interactions */
+        if (do_coul || half_LJ)
+        {
+            gmx_mm_pr facel_SSE;
+
+            facel_SSE    = gmx_set1_pr(facel);
+
+            iq_SSE0      = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci));
+            iq_SSE2      = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2));
+        }
+
+#ifdef LJ_COMB_LB
+        hsig_i_SSE0      = gmx_load2_hpr(ljc+sci2+0);
+        hsig_i_SSE2      = gmx_load2_hpr(ljc+sci2+2);
+        seps_i_SSE0      = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+        seps_i_SSE2      = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+#else
+#ifdef LJ_COMB_GEOM
+        c6s_SSE0         = gmx_load2_hpr(ljc+sci2+0);
+        if (!half_LJ)
+        {
+            c6s_SSE2     = gmx_load2_hpr(ljc+sci2+2);
+        }
+        c12s_SSE0        = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+        if (!half_LJ)
+        {
+            c12s_SSE2    = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+        }
+#else
+        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
+        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+        if (!half_LJ)
+        {
+            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+        }
+#endif
+#endif
+
+        /* Zero the potential energy for this list */
+        VvdwtotSSE       = gmx_setzero_pr();
+        vctotSSE         = gmx_setzero_pr();
+
+        /* Clear i atom forces */
+        fix_SSE0           = gmx_setzero_pr();
+        fix_SSE2           = gmx_setzero_pr();
+        fiy_SSE0           = gmx_setzero_pr();
+        fiy_SSE2           = gmx_setzero_pr();
+        fiz_SSE0           = gmx_setzero_pr();
+        fiz_SSE2           = gmx_setzero_pr();
+
+        cjind = cjind0;
+
+        /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+        if (half_LJ)
+        {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+#undef HALF_LJ
+#undef CALC_COULOMB
+        }
+        else if (do_coul)
+        {
+#define CALC_COULOMB
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+#undef CALC_COULOMB
+        }
+        else
+        {
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+        }
+#undef CALC_LJ
+        ninner += cjind1 - cjind0;
+
+        /* Add accumulated i-forces to the force array */
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+#define gmx_load_ps4  _mm_load_ps
+#define gmx_store_ps4 _mm_store_ps
+#define gmx_add_ps4   _mm_add_ps
+#else
+#define gmx_load_ps4  _mm256_load_pd
+#define gmx_store_ps4 _mm256_store_pd
+#define gmx_add_ps4   _mm256_add_pd
+#endif
+        GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0,fix_SSE2,fix_SSE);
+        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+
+        GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0,fiy_SSE2,fiy_SSE);
+        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+
+        GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0,fiz_SSE2,fiz_SSE);
+        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+        gmx_store_ps4(shf,fix_SSE);
+        fshift[ish3+0] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiy_SSE);
+        fshift[ish3+1] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiz_SSE);
+        fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
+        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
+        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
+        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
+        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
+        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
+        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
+        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
+        fshift[ish3+0] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
+        fshift[ish3+1] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
+        fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+        if (do_coul)
+        {
+            gmx_store_pr(tmpsum,vctotSSE);
+            *Vc += SUM_SIMD(tmpsum);
+        }
+
+        gmx_store_pr(tmpsum,VvdwtotSSE);
+        *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+        /* Outer loop uses 6 flops/iteration */
+    }
+
+#ifdef COUNT_PAIRS
+    printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef gmx_load2_hpr
+
+#undef gmx_load_ps4
+#undef gmx_store_ps4
+#undef gmx_add_ps4
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI   
+#undef UNROLLJ   
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
similarity index 84%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
index eec30d3da2cc3295070c89e961a342167fd47a07..470d27dcf7f96b6412979f1ecbbd3fd7f7dc4319 100644 (file)
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
 
-#ifdef GMX_X86_AVX_256
+#ifdef GMX_NBNXN_SIMD_4XN
 
-#include "nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernel_simd_4xn.h"
 
-/* Include all flavors of the 256-bit AVX kernel loops */
+/* Include all flavors of the SSE or AVX 4xN kernel loops */
 
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
 #define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 
 /* Analytical reaction-field kernels */
 #define CALC_COUL_RF
 
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 #undef CALC_COUL_RF
 
 #define CALC_COUL_TAB
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_TAB
 #define CALC_COUL_EWALD
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_EWALD
@@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 
 enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
                                   real *Vvdw,real *Vc)
 {
+    const int simd_width   = GMX_SIMD_WIDTH_HERE;
+    const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
     int ng_p2,i,j,j0,j1,c,s;
 
-#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
-
     ng_p2 = (1<<ng_2log);
 
     /* The size of the x86 SIMD energy group buffer array is:
-     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+     * ng*ng*ng_p2*unrollj_half*simd_width
      */
     for(i=0; i<ng; i++)
     {
@@ -163,34 +170,34 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_HALF; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
+                for(s=0; s<unrollj_half; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
                     Vc  [i*ng+j0] += VSc  [c+0];
                     Vc  [i*ng+j1] += VSc  [c+1];
-                    c += SIMD_WIDTH + 2;
+                    c += simd_width + 2;
                 }
             }
         }
     }
 }
 
-#endif /* GMX_X86_AVX_256 */
+#endif /* GMX_NBNXN_SIMD_4XN */
 
 void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec, 
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw)
-#ifdef GMX_X86_AVX_256
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
+                      const nbnxn_atomdata_t     *nbat,
+                      const interaction_const_t  *ic,
+                      int                        ewald_excl,
+                      rvec                       *shift_vec, 
+                      int                        force_flags,
+                      int                        clearF,
+                      real                       *fshift,
+                      real                       *Vc,
+                      real                       *Vvdw)
+#ifdef GMX_NBNXN_SIMD_4XN
 {
     int              nnbl;
     nbnxn_pairlist_t **nbl;
@@ -320,6 +327,6 @@ nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
 }
 #else
 {
-    gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
+    gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
 }
 #endif
similarity index 71%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h
index 9a8e53c9022ae7221cfc27c829896ff81f835623..a9d4d19802adc75155db56133959ecdaf9a74056 100644 (file)
@@ -35,8 +35,8 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifndef _nbnxn_kernel_x86_simd256_h
-#define _nbnxn_kernel_x86_simd256_h
+#ifndef _nbnxn_kernel_simd_4xn_h
+#define _nbnxn_kernel_simd_4xn_h
 
 #include "typedefs.h"
 
 extern "C" {
 #endif
 
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels compute interactions of 4-atom i-clusters with
+ * N-atom j-clusters, where N is the SIMD width.
+ */
 void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec,
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw);
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
+                      const nbnxn_atomdata_t     *nbat,
+                      const interaction_const_t  *ic,
+                      int                        ewald_excl,
+                      rvec                       *shift_vec,
+                      int                        force_flags,
+                      int                        clearF,
+                      real                       *fshift,
+                      real                       *Vc,
+                      real                       *Vvdw);
 
 #ifdef __cplusplus
 }
similarity index 86%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h
index 15c52fb097c1105719cca0f32b01d323c6acc5a7..07da218f247e9987683ef2d43557ee7dd28210da 100644 (file)
 /* Include the force+energy kernels */
 #define CALC_ENERGIES
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef CALC_ENERGIES
 
 /* Include the force+energygroups kernels */
 #define CALC_ENERGIES
 #define ENERGY_GROUPS
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef ENERGY_GROUPS
 #undef CALC_ENERGIES
 
 /* Include the force only kernels */
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
similarity index 99%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
index f1a0b9ccc82996fa94c6b750b637b6181d6f1e49..1676f1f43dc38c85901a37d8744d2afe45bdfee7 100644 (file)
@@ -35,8 +35,9 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* This is the innermost loop contents for the n vs n atom
- * SSE2 single precision kernels.
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel calculates interactions of 4 i-atoms
+ * with N j-atoms stored in N wide SIMD registers.
  */
 
 
             gmx_mm_pr  r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
             gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
             gmx_mm_pr  r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
-            /* Table index: rs converted to an int */ 
+            /* Table index: rs truncated to an int */
 #if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
             gmx_epi32  ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
 #else
             jxSSE         = gmx_load_pr(x+ajx);
             jySSE         = gmx_load_pr(x+ajy);
             jzSSE         = gmx_load_pr(x+ajz);
-            
+
             /* Calculate distance */
             dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
             dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
             dx_SSE3       = gmx_sub_pr(ix_SSE3,jxSSE);
             dy_SSE3       = gmx_sub_pr(iy_SSE3,jySSE);
             dz_SSE3       = gmx_sub_pr(iz_SSE3,jzSSE);
-            
+
             /* rsq = dx*dx+dy*dy+dz*dz */
             rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
             rsq_SSE1      = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
similarity index 95%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
index 0644f9776cd49ada8300b9f5c3f93c7df1732b01..1ab915deaecc3e5670ac0de2fffbd6b7da69bf78 100644 (file)
  */
 
 /* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
 
 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
 
 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE
+#define UNROLLJ    GMX_SIMD_WIDTH_HERE
 
 #if defined GMX_MM128_HERE || defined GMX_DOUBLE
 #define STRIDE     4
@@ -74,7 +74,7 @@
 
 #define SIMD_MASK_ALL   0xffffffff
 
-#include "nbnxn_kernel_x86_simd_utils.h"
+#include "nbnxn_kernel_simd_utils.h"
 
 /* All functionality defines are set here, except for:
  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
 /* Assumes all LJ parameters are identical */
 /* #define FIX_LJ_C */
 
-#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
 
 #if defined LJ_COMB_GEOM
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
 #else
 #if defined LJ_COMB_LB
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
 #else
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
 #endif
 #endif
 
 #ifdef CALC_COUL_RF
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
 #endif
 #ifdef CALC_COUL_TAB
 #ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
 #else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
 #endif
 #endif
 #ifdef CALC_COUL_EWALD
 #ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
 #else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
 #endif
 #endif
 
-#ifdef GMX_MM128_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
-#endif
-#ifdef GMX_MM256_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
-#endif
-
 static void
 #ifndef CALC_ENERGIES
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,noener)
 #else
 #ifndef ENERGY_GROUPS
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,ener)
 #else
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 #endif
 #endif
 #undef NBK_FUNC_NAME
@@ -571,7 +569,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
         }
 #endif
 
-               /* Load i atom data */
+        /* Load i atom data */
         sciy             = scix + STRIDE;
         sciz             = sciy + STRIDE;
         ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
@@ -661,13 +659,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
 #undef HALF_LJ
 #undef CALC_COULOMB
@@ -678,13 +676,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
 #undef CALC_COULOMB
         }
@@ -693,13 +691,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
         }
 #undef CALC_LJ
similarity index 90%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
index 0010a4fbe2f741f85bc612bf76a3b5e3075e66f9..45ab2aedcc9205f3d7d86d41cb3fbef186d09ef2 100644 (file)
     i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
     o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
 }
+#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE)                  \
+{                                                                       \
+    i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps());                \
+    i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps());                \
+    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
+    i_SSE2 = _mm256_permute_ps(i_SSE0,0xb1); /* 10110001b: swap pairs */ \
+    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+}
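+/* This variant reduces two 2xNN accumulators, each holding two i-atoms
+ * (one per 128-bit lane), into a single __m128 with the four per-i-atom
+ * sums, matching the output shape of GMX_MM_TRANSPOSE_SUM4_PR above.
+ */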
 #else
 #define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
 {                                                                       \
@@ -228,6 +236,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
 }
 
+#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE)                \
+{                                                                       \
+    __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2];                     \
+    int p;                                                              \
+                                                                        \
+    for(p=0; p<2*UNROLLJ; p++)                                            \
+    {                                                                   \
+        /* Here we load 4 aligned floats, but we need just 2 */         \
+        clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
+    }                                                                   \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+                                                                        \
+    GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE);                     \
+    GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
+}
+
 #endif
 
 #if defined GMX_MM128_HERE && defined GMX_DOUBLE
@@ -474,7 +499,7 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
 /* Add energy register to possibly multiple terms in the energy array.
  * This function is the same for SSE/AVX single/double.
  */
-static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
+static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,const int *offset_jj)
 {
     int jj;
 
@@ -486,9 +511,39 @@ static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
     {
         gmx_mm_pr v_SSE;
 
-        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
-        gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
+        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
     }
 }
 
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
+/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
+static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
+                                       real *v0,real *v1,const int *offset_jj)
+{
+    gmx_mm_hpr e_SSE0,e_SSE1;
+    int jj;
+
+    e_SSE0 = _mm256_extractf128_ps(e_SSE,0);
+    e_SSE1 = _mm256_extractf128_ps(e_SSE,1);
+
+    for(jj=0; jj<(UNROLLJ/2); jj++)
+    {
+        gmx_mm_hpr v_SSE;
+
+        v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
+    }
+    for(jj=0; jj<(UNROLLJ/2); jj++)
+    {
+        gmx_mm_hpr v_SSE;
+
+        v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
+    }
+}
+#endif
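
The halves variant does the same, but routes the two half-width parts of one 8-wide energy vector into two different group buffers, roughly:

static void add_ener_grp_halves_scalar(const float *e,
                                       float *v0, float *v1,
                                       const int *offset_jj,
                                       int nj, int half_width)
{
    int jj, s;

    for (jj = 0; jj < nj; jj++)
    {
        for (s = 0; s < half_width; s++)
        {
            v0[offset_jj[jj] + jj*half_width + s] += e[s];
            v1[offset_jj[jj] + jj*half_width + s] += e[half_width + s];
        }
    }
}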
+
 #endif /* _nbnxn_kernel_sse_utils_h_ */
index f356973898af2f48b8d0cdacac50b0d71ddb2a31..0358e3523d40217794eafd3a8067056093e33f36 100644 (file)
 
 #ifndef GMX_DOUBLE
 #define NBNXN_SEARCH_SSE_SINGLE
-#include "gmx_x86_simd_single.h"
-#else
-#include "gmx_x86_simd_double.h"
 #endif
 
+/* Include basic SSE2 stuff */
+#include <emmintrin.h>
+
 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
 #define NBNXN_8BB_SSE
 #endif
@@ -94,6 +94,9 @@
 #define STRIDE_8BB        4
 #define STRIDE_8BB_2LOG   2
 
+#endif /* NBNXN_SEARCH_SSE */
+
+#ifdef GMX_NBNXN_SIMD
 
 /* The functions below are macros as they are performance sensitive */
 
 #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 
 /* The j-cluster size is matched to the SIMD width */
-#ifndef GMX_DOUBLE
-/* 128 bits can hold 4 floats */
-#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
-#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
-#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
-/* 256 bits can hold 8 floats */
-#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
-#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
-#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
+#endif
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
+/* Half-SIMD-width j-cluster size */
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
+#endif
 #else
-/* 128 bits can hold 2 doubles */
-#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
-#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
-#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
-/* 256 bits can hold 4 doubles */
-#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
-#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
-#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
+#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#endif
 #endif
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* GMX_NBNXN_SIMD */
 
 
 /* Interaction masks for 4xN atom interactions.
@@ -253,12 +266,12 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 {
     switch (nb_kernel_type)
     {
-    case nbk4x4_PlainC:
-    case nbk4xN_X86_SIMD128:
-    case nbk4xN_X86_SIMD256:
+    case nbnxnk4x4_PlainC:
+    case nbnxnk4xN_SIMD_4xN:
+    case nbnxnk4xN_SIMD_2xNN:
         return NBNXN_CPU_CLUSTER_I_SIZE;
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
         /* The cluster size for super/sub lists is only set here.
          * Any value should work for the pair-search and atomdata code.
          * The kernels, of course, might require a particular value.
@@ -273,24 +286,33 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 
 int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 {
+    int nbnxn_simd_width=0;
+    int cj_size=0;
+
+#ifdef GMX_NBNXN_SIMD
+    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#endif
+
     switch (nb_kernel_type)
     {
-    case nbk4x4_PlainC:
-        return NBNXN_CPU_CLUSTER_I_SIZE;
-    case nbk4xN_X86_SIMD128:
-        /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
-        return 16/sizeof(real);
-    case nbk4xN_X86_SIMD256:
-        /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
-        return 32/sizeof(real);
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
-        return nbnxn_kernel_to_ci_size(nb_kernel_type);
+    case nbnxnk4x4_PlainC:
+        cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
+        break;
+    case nbnxnk4xN_SIMD_4xN:
+        cj_size = nbnxn_simd_width;
+        break;
+    case nbnxnk4xN_SIMD_2xNN:
+        cj_size = nbnxn_simd_width/2;
+        break;
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
+        cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
+        break;
     default:
         gmx_incons("unknown kernel type");
     }
 
-    return 0;
+    return cj_size;
 }
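
The reworked function derives the j-cluster size from the SIMD width in reals instead of hard-coding one value per kernel flavor. A standalone sketch of the same arithmetic (the helper name is hypothetical):

static int simd_cj_size(int simd_bitwidth, int bytes_per_real, int half_width)
{
    int width = simd_bitwidth/(bytes_per_real*8); /* reals per register */

    return half_width ? width/2 : width;
}

For example, simd_cj_size(256, 4, 0) gives 8 (4xN single precision), simd_cj_size(256, 4, 1) gives 4 (2xNN), and simd_cj_size(128, 8, 0) gives 2 (4xN double precision).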
 
 static int ci_to_cj(int na_cj_2log,int ci)
@@ -307,20 +329,20 @@ static int ci_to_cj(int na_cj_2log,int ci)
 
 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 {
-    if (nb_kernel_type == nbkNotSet)
+    if (nb_kernel_type == nbnxnkNotSet)
     {
         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
     }
 
     switch (nb_kernel_type)
     {
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
         return FALSE;
 
-    case nbk4x4_PlainC:
-    case nbk4xN_X86_SIMD128:
-    case nbk4xN_X86_SIMD256:
+    case nbnxnk4x4_PlainC:
+    case nbnxnk4xN_SIMD_4xN:
+    case nbnxnk4xN_SIMD_2xNN:
         return TRUE;
 
     default:
@@ -2360,18 +2382,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 
     snew(nbl->work,1);
 #ifdef NBNXN_BBXXXX
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
 #else
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
-#endif
-    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
-#ifdef NBNXN_SEARCH_SSE
-    snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
-#ifdef GMX_X86_AVX_256
-    snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
 #endif
+    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+#ifdef GMX_NBNXN_SIMD
+    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
+    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
 #endif
-    snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
+    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
 }
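
All pair-list work buffers are now allocated with 32-byte alignment, which satisfies the widest (256-bit) SIMD loads as well as the 128-bit ones, instead of the previous 16 bytes. A minimal sketch of such an allocation, assuming a POSIX environment:

#include <stdlib.h>

/* Allocate n bytes aligned for 256-bit (32-byte) SIMD loads/stores */
static void *alloc_simd_aligned(size_t n)
{
    void *p = NULL;

    if (posix_memalign(&p, 32, n) != 0)
    {
        return NULL;
    }
    return p;
}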
 
 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
@@ -2626,7 +2646,6 @@ static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 }
 
-#ifdef NBNXN_SEARCH_SSE
 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 {
@@ -2639,7 +2658,6 @@ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 #endif
 }
 
-#ifdef GMX_X86_AVX_256
 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
 {
@@ -2647,12 +2665,23 @@ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
              NBNXN_INT_MASK_ALL));
-#else              /* cj-size = 2 */
+#else              /* cj-size = 4 */
     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 #endif
 }
+
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define get_imask_x86_simd_4xn  get_imask_x86_simd128
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define get_imask_x86_simd_4xn  get_imask_x86_simd256
+#define get_imask_x86_simd_2xnn get_imask_x86_simd128
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 #endif
-#endif /* NBNXN_SEARCH_SSE */
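
The masks returned here are precomputed constants consumed by the kernels. Purely as an illustration, and under the assumption that bit i*cj_size+j of a mask enables pair (i,j), a diagonal mask that avoids double-counting pairs within a self-interacting cluster pair could be built like this:

static unsigned int diag_mask(int ci_size, int cj_size)
{
    unsigned int mask = 0;
    int i, j;

    for (i = 0; i < ci_size; i++)
    {
        for (j = 0; j < cj_size; j++)
        {
            if (j > i) /* keep only pairs above the self-diagonal */
            {
                mask |= 1U << (i*cj_size + j);
            }
        }
    }
    return mask;
}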
 
 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
  * Checks bounding box distances and possibly atom pair distances.
@@ -2773,23 +2802,11 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
     }
 }
 
-#ifdef NBNXN_SEARCH_SSE
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S  PACK_X4
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM256_HERE
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "nbnxn_search_simd_4xn.h"
 #endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "nbnxn_search_simd_2xnn.h"
 #endif
 
 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
@@ -4495,7 +4512,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 
                                     switch (nb_kernel_type)
                                     {
-                                    case nbk4x4_PlainC:
+                                    case nbnxnk4x4_PlainC:
                                         check_subcell_list_space_simple(nbl,cl-cf+1);
 
                                         make_cluster_list_simple(gridj,
@@ -4505,30 +4522,30 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                                                  rl2,rbb2,
                                                                  &ndistc);
                                         break;
-#ifdef NBNXN_SEARCH_SSE
-                                    case nbk4xN_X86_SIMD128:
+#ifdef GMX_NBNXN_SIMD_4XN
+                                    case nbnxnk4xN_SIMD_4xN:
                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
-                                        make_cluster_list_x86_simd128(gridj,
-                                                                      nbl,ci,cf,cl,
-                                                                      (gridi == gridj && shift == CENTRAL),
-                                                                      nbat->x,
-                                                                      rl2,rbb2,
-                                                                      &ndistc);
+                                        make_cluster_list_simd_4xn(gridj,
+                                                                   nbl,ci,cf,cl,
+                                                                   (gridi == gridj && shift == CENTRAL),
+                                                                   nbat->x,
+                                                                   rl2,rbb2,
+                                                                   &ndistc);
                                         break;
-#ifdef GMX_X86_AVX_256
-                                    case nbk4xN_X86_SIMD256:
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+                                    case nbnxnk4xN_SIMD_2xNN:
                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
-                                        make_cluster_list_x86_simd256(gridj,
-                                                                      nbl,ci,cf,cl,
-                                                                      (gridi == gridj && shift == CENTRAL),
-                                                                      nbat->x,
-                                                                      rl2,rbb2,
-                                                                      &ndistc);
+                                        make_cluster_list_simd_2xnn(gridj,
+                                                                    nbl,ci,cf,cl,
+                                                                    (gridi == gridj && shift == CENTRAL),
+                                                                    nbat->x,
+                                                                    rl2,rbb2,
+                                                                    &ndistc);
                                         break;
 #endif
-#endif
-                                    case nbk8x8x8_PlainC:
-                                    case nbk8x8x8_CUDA:
+                                    case nbnxnk8x8x8_PlainC:
+                                    case nbnxnk8x8x8_CUDA:
                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
                                         for(cj=cf; cj<=cl; cj++)
                                         {
@@ -4728,15 +4745,15 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs,
     {
         switch (nb_kernel_type)
         {
-#ifdef NBNXN_SEARCH_SSE
-        case nbk4xN_X86_SIMD128:
-            nbs->icell_set_x = icell_set_x_x86_simd128;
-            break;
-#ifdef GMX_X86_AVX_256
-        case nbk4xN_X86_SIMD256:
-            nbs->icell_set_x = icell_set_x_x86_simd256;
+#ifdef GMX_NBNXN_SIMD_4XN
+        case nbnxnk4xN_SIMD_4xN:
+            nbs->icell_set_x = icell_set_x_simd_4xn;
             break;
 #endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+        case nbnxnk4xN_SIMD_2xNN:
+            nbs->icell_set_x = icell_set_x_simd_2xnn;
+            break;
 #endif
         default:
             nbs->icell_set_x = icell_set_x_simple;
diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h b/src/mdlib/nbnxn_search_simd_2xnn.h
new file mode 100644 (file)
index 0000000..04dd501
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_WIDTH_HERE/2)
+#else
+#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+{
+    gmx_mm_hpr a_SSE;
+
+    a_SSE = _mm_load_ps(a);
+
+    return gmx_2hpr_to_pr(a_SSE,a_SSE);
+}
+
+static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift)
+{
+    gmx_mm_hpr a0,a1;
+
+    a0 = _mm_set1_ps(a[0] + shift);
+    a1 = _mm_set1_ps(a[1] + shift);
+
+    return gmx_2hpr_to_pr(a1,a0);
+}
+
+/* Copies PBC-shifted i-cell packed atom coordinates to the working array */
+static gmx_inline void
+icell_set_x_simd_2xnn(int ci,
+                      real shx,real shy,real shz,
+                      int na_c,
+                      int stride,const real *x,
+                      nbnxn_list_work_t *work)
+{
+    int  ia;
+    nbnxn_x_ci_simd_2xnn_t *x_ci;
+
+    x_ci = work->x_ci_simd_2xnn;
+
+    ia = X_IND_CI_SIMD_2XNN(ci);
+
+    x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+    x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+    x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+    x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+    x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+    x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+}
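+
+/* Scalar picture of the 2xNN i-layout set up above: one 8-wide register
+ * carries two i-atoms, each coordinate broadcast across one 4-wide half.
+ * Which atom lands in which half is fixed by gmx_2hpr_to_pr; the order
+ * below is an assumption for illustration only.
+ *
+ * static void set_2real_shift_scalar(const float *a, float shift,
+ *                                    float out[8])
+ * {
+ *     int s;
+ *
+ *     for (s = 0; s < 4; s++)
+ *     {
+ *         out[s]     = a[0] + shift;  first i-atom in the low half
+ *         out[4 + s] = a[1] + shift;  second i-atom in the high half
+ *     }
+ * }
+ */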
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bounding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+                            nbnxn_pairlist_t *nbl,
+                            int ci,int cjf,int cjl,
+                            gmx_bool remove_sub_diag,
+                            const real *x_j,
+                            real rl2,float rbb2,
+                            int *ndistc)
+{
+    const nbnxn_x_ci_simd_2xnn_t *work;
+    const float *bb_ci;
+
+    gmx_mm_pr  jx_SSE,jy_SSE,jz_SSE;
+
+    gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
+    gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
+
+    gmx_mm_pr  rsq_SSE0;
+    gmx_mm_pr  rsq_SSE2;
+
+    gmx_mm_pr  wco_SSE0;
+    gmx_mm_pr  wco_SSE2;
+    gmx_mm_pr  wco_any_SSE;
+
+    gmx_mm_pr  rc2_SSE;
+
+    gmx_bool   InRange;
+    float      d2;
+    int        xind_f,xind_l,cj;
+
+    cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+    cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+
+    work = nbl->work->x_ci_simd_2xnn;
+
+    bb_ci = nbl->work->bb_ci;
+
+    rc2_SSE   = gmx_set1_pr(rl2);
+
+    InRange = FALSE;
+    while (!InRange && cjf <= cjl)
+    {
+        d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj);
+        *ndistc += 2;
+
+        /* Check if the distance is within the distance where
+         * we use only the bounding box distance rbb,
+         * or within the cut-off and there is at least one atom pair
+         * within the cut-off.
+         */
+        if (d2 < rbb2)
+        {
+            InRange = TRUE;
+        }
+        else if (d2 < rl2)
+        {
+            xind_f  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+
+            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+
+            /* Calculate distance */
+            dx_SSE0            = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+            dy_SSE0            = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+            dz_SSE0            = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+            dx_SSE2            = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+            dy_SSE2            = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+            dz_SSE2            = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+            wco_any_SSE        = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+            InRange            = gmx_movemask_pr(wco_any_SSE);
+
+            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+        }
+        if (!InRange)
+        {
+            cjf++;
+        }
+    }
+    if (!InRange)
+    {
+        return;
+    }
+
+    InRange = FALSE;
+    while (!InRange && cjl > cjf)
+    {
+        d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj);
+        *ndistc += 2;
+
+        /* Check if the distance is within the distance where
+         * we use only the bounding box distance rbb,
+         * or within the cut-off and there is at least one atom pair
+         * within the cut-off.
+         */
+        if (d2 < rbb2)
+        {
+            InRange = TRUE;
+        }
+        else if (d2 < rl2)
+        {
+            xind_l  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+
+            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+
+            /* Calculate distance */
+            dx_SSE0            = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+            dy_SSE0            = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+            dz_SSE0            = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+            dx_SSE2            = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+            dy_SSE2            = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+            dz_SSE2            = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+            wco_any_SSE        = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+            InRange            = gmx_movemask_pr(wco_any_SSE);
+
+            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+        }
+        if (!InRange)
+        {
+            cjl--;
+        }
+    }
+
+    if (cjf <= cjl)
+    {
+        for(cj=cjf; cj<=cjl; cj++)
+        {
+            /* Store cj and the interaction mask */
+            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+            nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj);
+            nbl->ncj++;
+        }
+        /* Increase the closing index in i super-cell list */
+        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+    }
+}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
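
Both while loops above implement the same pruning idea: shrink the candidate j-cluster range from the front and from the back until the first in-range cluster is met on each side, then list everything in between. In outline, with cluster_in_range() as a hypothetical stand-in for the bounding-box plus atom-pair distance tests:

static void prune_cluster_range(int *cjf, int *cjl, float rl2,
                                int (*cluster_in_range)(int cj, float rl2))
{
    while (*cjf <= *cjl && !cluster_in_range(*cjf, rl2))
    {
        (*cjf)++; /* advance the first cluster while out of range */
    }
    while (*cjl > *cjf && !cluster_in_range(*cjl, rl2))
    {
        (*cjl)--; /* retreat the last cluster while out of range */
    }
}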
similarity index 76%
rename from src/mdlib/nbnxn_search_x86_simd.h
rename to src/mdlib/nbnxn_search_simd_4xn.h
index a6af973d8a929d9f2c4ad33a8823a5994b623351..60742fb7196a16f90df3601808f4827995dbd96c 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file.
- * gmx_sse_or_avh.h should be included before including this file.
- */
-
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
-#ifdef GMX_MM128_HERE
-static void icell_set_x_x86_simd128
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
 #else
-#ifdef GMX_MM256_HERE
-static void icell_set_x_x86_simd256
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
 #else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 #endif
 #endif
-                                   (int ci,
-                                    real shx,real shy,real shz,
-                                    int na_c,
-                                    int stride,const real *x,
-                                    nbnxn_list_work_t *work)
-{
-    int  ia;
-#ifdef GMX_MM128_HERE
-    nbnxn_x_ci_x86_simd128_t *x_ci;
-
-    x_ci = work->x_ci_x86_simd128;
+#include "gmx_simd_macros.h"
 
-    ia = X_IND_CI_S128(ci);
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_WIDTH_HERE)
 #else
-    nbnxn_x_ci_x86_simd256_t *x_ci;
+#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+/* Copies PBC-shifted i-cell packed atom coordinates to the working array */
+static gmx_inline void
+icell_set_x_simd_4xn(int ci,
+                     real shx,real shy,real shz,
+                     int na_c,
+                     int stride,const real *x,
+                     nbnxn_list_work_t *work)
+{
+    int  ia;
+    nbnxn_x_ci_simd_4xn_t *x_ci;
 
-    x_ci = work->x_ci_x86_simd256;
+    x_ci = work->x_ci_simd_4xn;
 
-    ia = X_IND_CI_S256(ci);
-#endif
+    ia = X_IND_CI_SIMD_4XN(ci);
 
     x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
     x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
@@ -85,34 +82,21 @@ static void icell_set_x_x86_simd256
     x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
 }
 
-/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
  * for coordinates in packed format.
 * Checks bounding box distances and possibly atom pair distances.
  * This is an accelerated version of make_cluster_list_simple.
  */
-#ifdef GMX_MM128_HERE
-static void make_cluster_list_x86_simd128
-#else
-#ifdef GMX_MM256_HERE
-static void make_cluster_list_x86_simd256
-#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
-#endif
-#endif
-                                         (const nbnxn_grid_t *gridj,
-                                          nbnxn_pairlist_t *nbl,
-                                          int ci,int cjf,int cjl,
-                                          gmx_bool remove_sub_diag,
-                                          const real *x_j,
-                                          real rl2,float rbb2,
-                                          int *ndistc)
+static gmx_inline void
+make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+                           nbnxn_pairlist_t *nbl,
+                           int ci,int cjf,int cjl,
+                           gmx_bool remove_sub_diag,
+                           const real *x_j,
+                           real rl2,float rbb2,
+                           int *ndistc)
 {
-#ifdef GMX_MM128_HERE
-    const nbnxn_x_ci_x86_simd128_t *work;
-#else
-    const nbnxn_x_ci_x86_simd256_t *work;
-#endif
-
+    const nbnxn_x_ci_simd_4xn_t *work;
     const float *bb_ci;
 
     gmx_mm_pr  jx_SSE,jy_SSE,jz_SSE;
@@ -139,17 +123,10 @@ static void make_cluster_list_x86_simd256
     float      d2;
     int        xind_f,xind_l,cj;
 
-#ifdef GMX_MM128_HERE
-    cjf = CI_TO_CJ_S128(cjf);
-    cjl = CI_TO_CJ_S128(cjl+1) - 1;
-
-    work = nbl->work->x_ci_x86_simd128;
-#else
-    cjf = CI_TO_CJ_S256(cjf);
-    cjl = CI_TO_CJ_S256(cjl+1) - 1;
+    cjf = CI_TO_CJ_SIMD_4XN(cjf);
+    cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
 
-    work = nbl->work->x_ci_x86_simd256;
-#endif
+    work = nbl->work->x_ci_simd_4xn;
 
     bb_ci = nbl->work->bb_ci;
 
@@ -172,11 +149,8 @@ static void make_cluster_list_x86_simd256
         }
         else if (d2 < rl2)
         {
-#ifdef GMX_MM128_HERE
-            xind_f  = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf);
-#else
-            xind_f  = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf);
-#endif
+            xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+
             jx_SSE  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
             jy_SSE  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
             jz_SSE  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
@@ -213,7 +187,7 @@ static void make_cluster_list_x86_simd256
             
             InRange            = gmx_movemask_pr(wco_any_SSE);
 
-            *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }
         if (!InRange)
         {
@@ -242,11 +216,8 @@ static void make_cluster_list_x86_simd256
         }
         else if (d2 < rl2)
         {
-#ifdef GMX_MM128_HERE
-            xind_l  = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl);
-#else
-            xind_l  = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl);
-#endif
+            xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+
             jx_SSE  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
             jy_SSE  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
             jz_SSE  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
@@ -282,7 +253,7 @@ static void make_cluster_list_x86_simd256
             
             InRange            = gmx_movemask_pr(wco_any_SSE);
 
-            *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }
         if (!InRange)
         {
@@ -295,16 +266,15 @@ static void make_cluster_list_x86_simd256
         for(cj=cjf; cj<=cjl; cj++)
         {
             /* Store cj and the interaction mask */
-#ifdef GMX_MM128_HERE
-            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_S128(gridj->cell0) + cj;
-            nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj);
-#else
-            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_S256(gridj->cell0) + cj;
-            nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj);
-#endif
+            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+            nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj);
             nbl->ncj++;
         }
         /* Increase the closing index in i super-cell list */
         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
     }
 }
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
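
The vectorized distance test above reduces, in scalar form, to asking whether any (i,j) atom pair of the two clusters lies within the list cut-off; a sketch with plain xyz triplets rather than the packed kernel layout:

static int any_pair_within(const float *xi, const float *xj,
                           int ni, int nj, float rl2)
{
    int i, j;

    for (i = 0; i < ni; i++)
    {
        for (j = 0; j < nj; j++)
        {
            float dx = xi[3*i]     - xj[3*j];
            float dy = xi[3*i + 1] - xj[3*j + 1];
            float dz = xi[3*i + 2] - xj[3*j + 2];

            if (dx*dx + dy*dy + dz*dz < rl2)
            {
                return 1;
            }
        }
    }
    return 0;
}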
index b57d35cf4d74830a0c86247c1e53415d6d7f370b..eb9f636140116ec2adad8531460842b5057c0013 100644 (file)
@@ -93,8 +93,8 @@
 #include "nbnxn_atomdata.h"
 #include "nbnxn_search.h"
 #include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
 #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 
 #ifdef GMX_LIB_MPI
@@ -620,13 +620,13 @@ static void do_nb_verlet(t_forcerec *fr,
         gmx_incons("Invalid cut-off scheme passed!");
     }
 
-    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
     {
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
     switch (nbvg->kernel_type)
     {
-        case nbk4x4_PlainC:
+        case nbnxnk4x4_PlainC:
             nbnxn_kernel_ref(&nbvg->nbl_lists,
                              nbvg->nbat, ic,
                              fr->shift_vec,
@@ -639,38 +639,38 @@ static void do_nb_verlet(t_forcerec *fr,
                              enerd->grpp.ener[egLJSR]);
             break;
         
-        case nbk4xN_X86_SIMD128:
-            nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
-                                     nbvg->nbat, ic,
-                                     nbvg->ewald_excl,
-                                     fr->shift_vec,
-                                     flags,
-                                     clearF,
-                                     fr->fshift[0],
-                                     enerd->grpp.ener[egCOULSR],
-                                     fr->bBHAM ?
-                                     enerd->grpp.ener[egBHAMSR] :
-                                     enerd->grpp.ener[egLJSR]);
+        case nbnxnk4xN_SIMD_4xN:
+            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+                                  nbvg->nbat, ic,
+                                  nbvg->ewald_excl,
+                                  fr->shift_vec,
+                                  flags,
+                                  clearF,
+                                  fr->fshift[0],
+                                  enerd->grpp.ener[egCOULSR],
+                                  fr->bBHAM ?
+                                  enerd->grpp.ener[egBHAMSR] :
+                                  enerd->grpp.ener[egLJSR]);
             break;
-        case nbk4xN_X86_SIMD256:
-            nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
-                                     nbvg->nbat, ic,
-                                     nbvg->ewald_excl,
-                                     fr->shift_vec,
-                                     flags,
-                                     clearF,
-                                     fr->fshift[0],
-                                     enerd->grpp.ener[egCOULSR],
-                                     fr->bBHAM ?
-                                     enerd->grpp.ener[egBHAMSR] :
-                                     enerd->grpp.ener[egLJSR]);
+        case nbnxnk4xN_SIMD_2xNN:
+            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+                                   nbvg->nbat, ic,
+                                   nbvg->ewald_excl,
+                                   fr->shift_vec,
+                                   flags,
+                                   clearF,
+                                   fr->fshift[0],
+                                   enerd->grpp.ener[egCOULSR],
+                                   fr->bBHAM ?
+                                   enerd->grpp.ener[egBHAMSR] :
+                                   enerd->grpp.ener[egLJSR]);
             break;
 
-        case nbk8x8x8_CUDA:
+        case nbnxnk8x8x8_CUDA:
             nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
             break;
 
-        case nbk8x8x8_PlainC:
+        case nbnxnk8x8x8_PlainC:
             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                  nbvg->nbat, ic,
                                  fr->shift_vec,
@@ -688,7 +688,7 @@ static void do_nb_verlet(t_forcerec *fr,
             gmx_incons("Invalid nonbonded kernel type passed!");
 
     }
-    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
     {
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
@@ -785,7 +785,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
     bDoForces     = (flags & GMX_FORCE_FORCES);
     bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
     bUseGPU       = fr->nbv->bUseGPU;
-    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
 
     if (bStateChanged)
     {
@@ -1000,7 +1000,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
 
             wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 
-            if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
             {
                 /* initialize non-local pair-list on the GPU */
                 nbnxn_cuda_init_pairlist(nbv->cu_nbv,