+++ /dev/null
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2012, by the GROMACS development team, led by
-# David van der Spoel, Berk Hess, Erik Lindahl, and including many
-# others, as listed in the AUTHORS file in the top-level source
-# directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-#
-# Check GCC version and if any of the 4.1.x family compiler suites is found
-# quit the build system generating process.
-#
-# The GCC 4.1.x compilers contain an optimization related bug which might
-# results in code that exhibits incorrect behaviour and often leads to
-# exploding systems or crashes.
-#
-# For further details see e.g.
-# https://bugs.launchpad.net/ubuntu/+source/gcc-4.1/+bug/158799
-#
-# Szilard Pall (pszilard@cbr.su.se)
-#
-
-if(NOT GMX_DISABLE_GCC41_CHECK)
-
-if(CMAKE_COMPILER_IS_GNUCC)
- # if we have -dumpversion flag use that, otherwise try the --version
- execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
- RESULT_VARIABLE _gcc_dumpversion_res
- OUTPUT_VARIABLE _gcc_dumpversion_out
- OUTPUT_STRIP_TRAILING_WHITESPACE)
- # if gcc returned with error the -dumpversion is not available
- if(${_gcc_dumpversion_res} EQUAL 0)
- if(${_gcc_dumpversion_out} MATCHES ".*4\\.1\\.[0-9]+.*")
- message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x
- family (detected version: ${_gcc_dumpversion_out}). These compilers
- contain an optimization related bug which might results in code that
- exhibits incorrect behaviour and often leads to exploding systems or
- crashes. To disable this check set GMX_DISABLE_GCC41_CHECK=YES.")
- endif()
- else()
- message(WARNING " The GCC compiler in use does not support the -dumpversion flag.
- Will attempt parsing the version from the \"gcc --version\" output.")
- execute_process(COMMAND ${CMAKE_C_COMPILER} --version
- OUTPUT_VARIABLE _gcc_version_out
- OUTPUT_STRIP_TRAILING_WHITESPACE)
- if("${_gcc_version_out}" MATCHES ".*4\\.1\\.[0-9]+.*")
- message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x
- family. These compiler compilers contain an optimization related bug
- which might results in code that exhibits incorrect behaviour and
- often leads to exploding systems or crashes. To disable this check set
- GMX_DISABLE_GCC41_CHECK=YES.")
- endif()
- endif()
-endif()
-
-endif()
};
static const char *
-GPLText[] = {
+LicenseText[] = {
"This program is free software; you can redistribute it and/or",
- "modify it under the terms of the GNU General Public License",
- "as published by the Free Software Foundation; either version 2",
+ "modify it under the terms of the GNU Lesser General Public License",
+ "as published by the Free Software Foundation; either version 2.1",
"of the License, or (at your option) any later version."
};
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/* The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
/* Undefine all defines used below so we can include this file multiple times
* with different settings from the same source file.
*/
/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
-#undef GMX_X86_SIMD_WIDTH_HERE
+#undef GMX_SIMD_WIDTH_HERE
#undef gmx_epi32
*/
#if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
-"You should define GMX_MM128_HERE or GMX_MM256_HERE"
+#error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
#endif
#if defined GMX_MM128_HERE && defined GMX_MM256_HERE
-"You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
+#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
#endif
#ifdef GMX_MM128_HERE
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m128
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 2
+#define GMX_SIMD_WIDTH_HERE 2
#define gmx_mm_pr __m128d
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 8
+#define GMX_SIMD_WIDTH_HERE 8
#define gmx_mm_pr __m256
#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
+#define gmx_loaddh_pr gmx_mm256_load4_ps
+
+/* Half SIMD-width type */
+#define gmx_mm_hpr __m128
+
+/* Half SIMD-width macros */
+#define gmx_load_hpr _mm_load_ps
+#define gmx_load1_hpr(x) _mm_set1_ps((x)[0])
+#define gmx_store_hpr _mm_store_ps
+#define gmx_add_hpr _mm_add_ps
+#define gmx_sub_hpr _mm_sub_ps
+
+#define gmx_sum4_hpr gmx_mm256_sum4h_m128
+
+/* Conversion between half and full SIMD-width */
+#define gmx_2hpr_to_pr gmx_mm256_set_m128
+
#else
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m256d
}
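+/* Load 4 floats and duplicate them in the low and high half of a
+ * 256-bit register; used as gmx_loaddh_pr by the 2x(N+N) kernels so
+ * that one register of j-data serves two i-atoms.
+ */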
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+ __m128 a;
+
+ a = _mm_load_ps(p);
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
+}
+
+
static __m256d
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
}
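+/* Add the two 256-bit inputs and return the sum of the low and high
+ * 128-bit halves of the result; used as gmx_sum4_hpr to reduce the
+ * four accumulator lane-halves in the j-force update.
+ */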
+static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
+{
+ __m256 sum;
+
+ sum = _mm256_add_ps(x,y);
+ return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1));
+}
static void
extern "C" {
#endif
-/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */
-enum { nbkNotSet = 0,
- nbk4x4_PlainC,
- nbk4xN_X86_SIMD128,
- nbk4xN_X86_SIMD256,
- nbk8x8x8_CUDA,
- nbk8x8x8_PlainC };
+#ifdef GMX_X86_SSE2
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+
+#ifdef GMX_X86_AVX_256
+/* Change this define to 128 to use AVX-128 kernels with AVX-256 acceleration */
+#define GMX_NBNXN_SIMD_BITWIDTH 256
+#else
+#define GMX_NBNXN_SIMD_BITWIDTH 128
+#endif
+
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be enabled independently.
+ * Currently the 2xNN SIMD kernels only make sense, and are only implemented,
+ * with AVX-256 in single precision, using a 4x4 cluster setup instead of 4x8.
+ */
+#define GMX_NBNXN_SIMD_4XN
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#endif
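+
+/* Example of how the defines above resolve (illustration only):
+ * a single precision AVX-256 build gets GMX_NBNXN_SIMD_BITWIDTH 256 and
+ * both GMX_NBNXN_SIMD_4XN and GMX_NBNXN_SIMD_2XNN, i.e. 4x8 and 2x(4+4)
+ * kernels; a double precision AVX-256 build gets only 4XN, i.e. 4x4.
+ */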
+
+
+/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+typedef enum
+{
+ nbnxnkNotSet = 0,
+ nbnxnk4x4_PlainC,
+ nbnxnk4xN_SIMD_4xN,
+ nbnxnk4xN_SIMD_2xNN,
+ nbnxnk8x8x8_CUDA,
+ nbnxnk8x8x8_PlainC,
+ nbnxnkNR
+} nbnxn_kernel_type;
/* Note that _mm_... intrinsics can be converted to either SSE or AVX
* depending on compiler flags.
* For gcc we check for __AVX__
* At least a check for icc should be added (if there is a macro)
*/
-static const char *nbk_name[] =
- { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+static const char *nbnxn_kernel_name[nbnxnkNR] =
+ { "not set", "plain C",
+#if !(defined GMX_X86_SSE2)
+ "not available", "not available",
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
#ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
- "SSE2 4x4",
+ "SSE2", "SSE2",
#else
- "SSE2 4x2",
+ "SSE4.1", "SSE4.1",
#endif
#else
-#ifndef GMX_DOUBLE
- "SSE4.1 4x4",
-#else
- "SSE4.1 4x2",
+ "AVX-128", "AVX-128",
#endif
-#endif
-#else
-#ifndef GMX_DOUBLE
- "AVX-128 4x4",
#else
- "AVX-128 4x2",
+ "AVX-256", "AVX-256",
#endif
#endif
-#ifndef GMX_DOUBLE
- "AVX-256 4x8",
-#else
- "AVX-256 4x4",
-#endif
- "CUDA 8x8x8", "plain C 8x8x8" };
+ "CUDA", "plain C" };
enum { ewaldexclTable, ewaldexclAnalytical };
/* non-bonded data structure with Verlet-type cut-off */
typedef struct {
- nbnxn_search_t nbs; /* n vs n atom pair searching data */
- int ngrp; /* number of interaction groups */
- nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
+ nbnxn_search_t nbs; /* n vs n atom pair searching data */
+ int ngrp; /* number of interaction groups */
+ nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */
nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */
* name of a file. Otherwise, we won't be able to find the library dir.
*/
#define NCR (int)asize(CopyrightText)
+/* TODO: Is this exception still needed? */
#ifdef GMX_FAHCORE
-#define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
+#define NLICENSE 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
#else
-#define NGPL (int)asize(GPLText)
+#define NLICENSE (int)asize(LicenseText)
#endif
char buf[256],tmpstr[1024];
for(i=0; (i<NCR); i++)
sp_print(out,CopyrightText[i]);
- for(i=0; (i<NGPL); i++)
- sp_print(out,GPLText[i]);
+ for(i=0; (i<NLICENSE); i++)
+ sp_print(out,LicenseText[i]);
fprintf(out,"\n");
* ridiculous number. */
static unsigned int max_gpu_ids_user = 64;
+static const char* invalid_gpuid_hint =
+ "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+
/* FW decl. */
void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
{
if (idstr[i] < '0' || idstr[i] > '9')
{
- gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
+ gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+                  idstr[i], invalid_gpuid_hint);
}
idlist[i] = idstr[i] - '0';
}
bGPUBin = FALSE;
#endif
- /* Bail if binary is not compiled with GPU on */
+    /* Bail if the binary was not compiled with GPU acceleration, but GPU use
+     * was requested either explicitly (-nb gpu) or implicitly (a GPU ID was
+     * passed). */
if (bForceUseGPU && !bGPUBin)
{
- gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ }
+ if (gpu_id != NULL && !bGPUBin)
+ {
+ gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
}
/* run the detection if the binary was compiled with GPU support */
if (nid == 0)
{
- gmx_fatal(FARGS, "Empty GPU ID string passed\n");
+ gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
}
res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
}
else
{
-#ifndef GMX_X86_SSE2
+#ifndef GMX_NBNXN_SIMD
list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
#else
- int simd_width;
-
-#ifdef GMX_X86_AVX_256
- simd_width = 256;
-#else
- simd_width = 128;
+ list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We assume the smallest cluster size to be on the safe side */
+ list_setup->cluster_size_j /= 2;
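+    /* E.g. a 256-bit build in single precision: 256/(4*8) = 8,
+     * halved to 4 in case the 2x(N+N) kernels are used (illustration). */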
#endif
- list_setup->cluster_size_j = simd_width/(sizeof(real)*8);
#endif
}
}
ic->ewaldcoeff = set->ewaldcoeff;
bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
- if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+ if (pme_lb->cutoff_scheme == ecutsVERLET &&
+ nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
}
static void pick_nbnxn_kernel_cpu(FILE *fp,
const t_commrec *cr,
const gmx_cpuid_t cpuid_info,
+ const t_inputrec *ir,
int *kernel_type,
int *ewald_excl)
{
- *kernel_type = nbk4x4_PlainC;
+ *kernel_type = nbnxnk4x4_PlainC;
*ewald_excl = ewaldexclTable;
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD
{
- /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
- * On AMD Bulldozer AVX-256 is much slower than AVX-128.
- */
- if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
- gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
- {
-#ifdef GMX_X86_AVX_256
- *kernel_type = nbk4xN_X86_SIMD256;
-#else
- *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
#endif
- }
- else
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We expect the 2xNN kernels to be faster in most cases */
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
{
- *kernel_type = nbk4xN_X86_SIMD128;
+ /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+             * 10% with HT, 50% without HT, but extra zero interactions
+ * can compensate. As we currently don't detect the actual use
+ * of HT, switch to 4x8 to avoid a potential performance hit.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
}
-
- if (getenv("GMX_NBNXN_AVX128") != NULL)
+#endif
+ if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
- *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+ gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
}
- if (getenv("GMX_NBNXN_AVX256") != NULL)
+ if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
{
-#ifdef GMX_X86_AVX_256
- *kernel_type = nbk4xN_X86_SIMD256;
+#ifdef GMX_NBNXN_SIMD_2XNN
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
#else
- gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
+ gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
#endif
}
const gmx_hw_info_t *hwinfo,
gmx_bool use_cpu_acceleration,
gmx_bool *bUseGPU,
+ const t_inputrec *ir,
int *kernel_type,
int *ewald_excl,
gmx_bool bDoNonbonded)
assert(kernel_type);
- *kernel_type = nbkNotSet;
+ *kernel_type = nbnxnkNotSet;
*ewald_excl = ewaldexclTable;
bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
if (bEmulateGPU)
{
- *kernel_type = nbk8x8x8_PlainC;
+ *kernel_type = nbnxnk8x8x8_PlainC;
if (bDoNonbonded)
{
}
else if (bGPU)
{
- *kernel_type = nbk8x8x8_CUDA;
+ *kernel_type = nbnxnk8x8x8_CUDA;
}
- if (*kernel_type == nbkNotSet)
+ if (*kernel_type == nbnxnkNotSet)
{
if (use_cpu_acceleration)
{
- pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
+ pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
kernel_type,ewald_excl);
}
else
{
- *kernel_type = nbk4x4_PlainC;
+ *kernel_type = nbnxnk4x4_PlainC;
}
}
if (bDoNonbonded && fp != NULL)
{
- if (MASTER(cr))
- {
- fprintf(stderr,"Using %s non-bonded kernels\n",
- nbk_name[*kernel_type]);
- }
- fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
- nbk_name[*kernel_type]);
+ fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
+ nbnxn_kernel_name[*kernel_type],
+ nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ nbnxn_kernel_to_cj_size(*kernel_type));
}
}
{
nbv->grp[i].nbl_lists.nnbl = 0;
nbv->grp[i].nbat = NULL;
- nbv->grp[i].kernel_type = nbkNotSet;
+ nbv->grp[i].kernel_type = nbnxnkNotSet;
if (i == 0) /* local */
{
pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
&nbv->bUseGPU,
+ ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
fr->bNonbonded);
/* Use GPU for local, select a CPU kernel for non-local */
pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
NULL,
+ ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
fr->bNonbonded);
for(i=0; i<nbv->ngrp; i++)
{
- if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+ if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nb_alloc = &pmalloc;
nb_free = &pfree;
ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
ma((void **)&out->Vc ,out->nV*sizeof(*out->Vc ));
- if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
- nb_kernel_type == nbk4xN_X86_SIMD256)
+ if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+ nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
{
cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
nbat->lj_comb = NULL;
if (simple)
{
+ int pack_x;
+
switch (nb_kernel_type)
{
- case nbk4xN_X86_SIMD128:
- nbat->XFormat = nbatX4;
- break;
- case nbk4xN_X86_SIMD256:
-#ifndef GMX_DOUBLE
- nbat->XFormat = nbatX8;
-#else
- nbat->XFormat = nbatX4;
-#endif
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+ nbnxn_kernel_to_cj_size(nb_kernel_type));
+ switch (pack_x)
+ {
+ case 4:
+ nbat->XFormat = nbatX4;
+ break;
+ case 8:
+ nbat->XFormat = nbatX8;
+ break;
+ default:
+ gmx_incons("Unsupported packing width");
+ }
break;
default:
nbat->XFormat = nbatXYZ;
#else
#define GMX_MM128_HERE
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
int i,s;
gmx_mm_pr dest_SSE,src_SSE;
if (bDestSet)
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(dest+i);
for(s=0; s<nsrc; s++)
}
else
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(src[0]+i);
for(s=1; s<nsrc; s++)
int nsubc_tot; /* Total number of subcell, used for printing */
} nbnxn_grid_t;
-#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd128 {
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+typedef struct nbnxn_x_ci_simd_4xn {
/* The i-cluster coordinates for simple search */
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd128_t;
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd256 {
+} nbnxn_x_ci_simd_4xn_t;
+
+typedef struct nbnxn_x_ci_simd_2xnn {
/* The i-cluster coordinates for simple search */
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
- gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
- gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd256_t;
-#undef GMX_MM256_HERE
-#endif
+} nbnxn_x_ci_simd_2xnn_t;
+
#endif
/* Working data for the actual i-supercell during pair search */
float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
real *x_ci; /* The coordinates, pbc shifted, for each atom */
-#ifdef NBNXN_SEARCH_SSE
- nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
-#ifdef GMX_X86_AVX_256
- nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
+ nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
#endif
int cj_ind; /* The current cj_ind index for the current list */
int cj4_init; /* The first unitialized cj4 block */
nbnxn_list_work_t *work);
static gmx_icell_set_x_t icell_set_x_simple;
-#ifdef NBNXN_SEARCH_SSE
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
-#ifdef GMX_X86_AVX_256
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
#endif
static gmx_icell_set_x_t icell_set_x_supersub;
#ifdef NBNXN_SEARCH_SSE
static gmx_icell_set_x_t icell_set_x_supersub_sse8;
#endif
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+
/* Local cycle count struct for profiling */
typedef struct {
int count;
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD_2XNN
-#include "nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernel_simd_2xnn.h"
-/* Include all flavors of the 128-bit SSE or AVX kernel loops */
+/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef CALC_COUL_RF
#define CALC_COUL_TAB
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_TAB
#define CALC_COUL_EWALD
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_EWALD
enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener
static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp
static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener
static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
int ng_p2,i,j,j0,j1,c,s;
-#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
-
ng_p2 = (1<<ng_2log);
/* The size of the x86 SIMD energy group buffer array is:
- * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+ * ng*ng*ng_p2*unrollj_half*simd_width
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_HALF; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
+ for(s=0; s<unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += SIMD_WIDTH + 2;
+ c += simd_width/2 + 2;
}
}
}
}
}
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD_2XNN */
void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw)
-#ifdef GMX_X86_SSE2
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw)
+#ifdef GMX_NBNXN_SIMD_2XNN
{
int nnbl;
nbnxn_pairlist_t **nbl;
}
#else
{
- gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
+ gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
}
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef _nbnxn_kernel_x86_simd128_h
-#define _nbnxn_kernel_x86_simd128_h
+#ifndef _nbnxn_kernel_simd_2xnn_h
+#define _nbnxn_kernel_simd_2xnn_h
#include "typedefs.h"
extern "C" {
#endif
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width 2*N
+ * by packing 2*N j-atom variables in SIMD registers.
+ */
void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw);
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw);
#ifdef __cplusplus
}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This file includes all flavors of the x86 SIMD 2x(N+N) kernels.
+ * Only the electrostatics type and optionally the VdW cut-off check
+ * need to be set before including this file.
+ */
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel duplicates the data for N j-particles in
+ * 2xN wide SIMD registers to operate on 2 i-particles at once.
+ * This leads to 4/2=2 sets of most instructions. Therefore we call
+ * this kernel 2x(N+N) = 2xnn.
+ *
+ * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ * and instructions removed.
+ *
+ * An alternative would be to load two different clusters of N j-particles
+ * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ * of instructions, which could lead to better scheduling. But we actually
+ * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ * It could be worth trying this option, but it takes some more effort.
+ */
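+
+/* A sketch of the register layout this scheme assumes, for AVX-256 in
+ * single precision (GMX_SIMD_WIDTH_HERE=8, UNROLLJ=4; values shown are
+ * illustrative):
+ *   jxSSE   = [ xj0 xj1 xj2 xj3 | xj0 xj1 xj2 xj3 ]  (gmx_loaddh_pr)
+ *   ix_SSE0 = [ xi0 xi0 xi0 xi0 | xi1 xi1 xi1 xi1 ]
+ *   ix_SSE2 = [ xi2 xi2 xi2 xi2 | xi3 xi3 xi3 xi3 ]
+ * One subtract then produces dx for two i-atoms at once, which is why
+ * only the _SSE0 and _SSE2 variable sets appear below.
+ */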
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or the virial is required we calculate them
+ * separately, as it is then easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask the cut-off,
+ * which can be faster with blendv (only available with SSE4.1 and later).
+ */
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define CUTOFF_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define CUTOFF_BLENDV
+#endif
+#endif
+
+ {
+ int cj,aj,ajx,ajy,ajz;
+
+#ifdef ENERGY_GROUPS
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+ /* Interaction (non-exclusion) mask of all 1's or 0's */
+ gmx_mm_pr int_SSE0;
+ gmx_mm_pr int_SSE2;
+#endif
+
+ gmx_mm_pr jxSSE,jySSE,jzSSE;
+ gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0;
+ gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2;
+ gmx_mm_pr tx_SSE0,ty_SSE0,tz_SSE0;
+ gmx_mm_pr tx_SSE2,ty_SSE2,tz_SSE2;
+ gmx_mm_pr rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
+ gmx_mm_pr rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
+#ifndef CUTOFF_BLENDV
+ /* wco: within cut-off, mask of all 1's or 0's */
+ gmx_mm_pr wco_SSE0;
+ gmx_mm_pr wco_SSE2;
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr wco_vdw_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr wco_vdw_SSE2;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+ /* 1/r masked with the interaction mask */
+ gmx_mm_pr rinv_ex_SSE0;
+ gmx_mm_pr rinv_ex_SSE2;
+#endif
+ gmx_mm_pr jq_SSE;
+ gmx_mm_pr qq_SSE0;
+ gmx_mm_pr qq_SSE2;
+#ifdef CALC_COUL_TAB
+ /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ gmx_mm_pr fsub_SSE0;
+ gmx_mm_pr fsub_SSE2;
+#endif
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr brsq_SSE0,brsq_SSE2;
+ gmx_mm_pr ewcorr_SSE0,ewcorr_SSE2;
+#endif
+
+ /* frcoul = (1/r - fsub)*r */
+ gmx_mm_pr frcoul_SSE0;
+ gmx_mm_pr frcoul_SSE2;
+#ifdef CALC_COUL_TAB
+ /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ gmx_mm_pr r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
+ gmx_mm_pr r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
+ /* Table index: rs truncated to an int */
+#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
+ gmx_epi32 ti_SSE0,ti_SSE2;
+#else
+ __m128i ti_SSE0,ti_SSE2;
+#endif
+ /* Linear force table values */
+ gmx_mm_pr ctab0_SSE0,ctab1_SSE0;
+ gmx_mm_pr ctab0_SSE2,ctab1_SSE2;
+#ifdef CALC_ENERGIES
+ /* Quadratic energy table value */
+ gmx_mm_pr ctabv_SSE0;
+ gmx_mm_pr ctabv_SSE2;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ /* The potential (PME mesh) we need to subtract from 1/r */
+ gmx_mm_pr vc_sub_SSE0;
+ gmx_mm_pr vc_sub_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ /* Electrostatic potential */
+ gmx_mm_pr vcoul_SSE0;
+ gmx_mm_pr vcoul_SSE2;
+#endif
+#endif
+ /* The force times 1/r */
+ gmx_mm_pr fscal_SSE0;
+ gmx_mm_pr fscal_SSE2;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+ /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ gmx_mm_pr hsig_j_SSE,seps_j_SSE;
+ /* LJ sigma_ij and epsilon_ij */
+ gmx_mm_pr sig_SSE0,eps_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig_SSE2,eps_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sig2_SSE0,sig6_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig2_SSE2,sig6_SSE2;
+#endif
+#endif /* CALC_ENERGIES */
+#endif /* LJ_COMB_LB */
+
+#ifdef LJ_COMB_GEOM
+ gmx_mm_pr c6s_j_SSE,c12s_j_SSE;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ /* Index for loading LJ parameters, complicated when interleaving */
+ int aj2;
+#endif
+
+#ifndef FIX_LJ_C
+ /* LJ C6 and C12 parameters, used with geometric comb. rule */
+ gmx_mm_pr c6_SSE0,c12_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr c6_SSE2,c12_SSE2;
+#endif
+#endif
+
+ /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+ gmx_mm_pr rinvsix_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr rinvsix_SSE2;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+ gmx_mm_pr sir_SSE0,sir2_SSE0,sir6_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sir_SSE2,sir2_SSE2,sir6_SSE2;
+#endif
+#endif
+
+ gmx_mm_pr FrLJ6_SSE0,FrLJ12_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr FrLJ6_SSE2,FrLJ12_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+ /* j-cluster index */
+ cj = l_cj[cjind].cj;
+
+ /* Atom indices (of the first atom in the cluster) */
+ aj = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+ aj2 = aj*2;
+#else
+ aj2 = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+ ajx = aj*DIM;
+#else
+ ajx = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+ ajy = ajx + STRIDE;
+ ajz = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
+ {
+ /* Load integer interaction mask */
+ /* With AVX there are no integer operations, so cast to real */
+ gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
+ /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
+ * With gcc we don't need the cast, but it's faster.
+ */
+#define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x))
+ int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
+ int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE);
+#undef cast_cvt
+ }
+#endif
+ /* load j atom coordinates */
+ jxSSE = gmx_loaddh_pr(x+ajx);
+ jySSE = gmx_loaddh_pr(x+ajy);
+ jzSSE = gmx_loaddh_pr(x+ajz);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE);
+ dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE);
+ dz_SSE0 = gmx_sub_pr(iz_SSE0,jzSSE);
+ dx_SSE2 = gmx_sub_pr(ix_SSE2,jxSSE);
+ dy_SSE2 = gmx_sub_pr(iy_SSE2,jySSE);
+ dz_SSE2 = gmx_sub_pr(iz_SSE2,jzSSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+#ifndef CUTOFF_BLENDV
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+ /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+ if (cj == ci_sh)
+ {
+ wco_SSE0 = gmx_and_pr(wco_SSE0,diag_SSE0);
+ wco_SSE2 = gmx_and_pr(wco_SSE2,diag_SSE2);
+ }
+#else
+#error "only UNROLLJ == UNROLLI currently supported in the joined kernels"
+#endif
+#else /* EXCL_FORCES */
+ /* Remove all excluded atom pairs from the list */
+ wco_SSE0 = gmx_and_pr(wco_SSE0,int_SSE0);
+ wco_SSE2 = gmx_and_pr(wco_SSE2,int_SSE2);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+ {
+                int i,j;
+                real tmp[2*UNROLLJ];
+                /* Only wco_SSE0 and wco_SSE2 exist in this kernel; each
+                 * holds the cut-off masks for two i-atoms (2*UNROLLJ entries). */
+                for(i=0; i<UNROLLI; i+=2)
+                {
+                    gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : wco_SSE2);
+                    for(j=0; j<2*UNROLLJ; j++)
+ {
+ if (!(tmp[j] == 0))
+ {
+ npair++;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef CHECK_EXCLS
+ /* For excluded pairs add a small number to avoid r^-6 = NaN */
+ rsq_SSE0 = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
+ rsq_SSE2 = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
+#endif
+
+ /* Calculate 1/r */
+ rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0);
+ rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2);
+
+#ifdef CALC_COULOMB
+ /* Load parameters for j atom */
+ jq_SSE = gmx_loaddh_pr(q+aj);
+ qq_SSE0 = gmx_mul_pr(iq_SSE0,jq_SSE);
+ qq_SSE2 = gmx_mul_pr(iq_SSE2,jq_SSE);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+ load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+#ifndef HALF_LJ
+ load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
+ c6s_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
+ c12s_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
+ c6_SSE0 = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
+#ifndef HALF_LJ
+ c6_SSE2 = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
+#endif
+ c12_SSE0 = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
+#ifndef HALF_LJ
+ c12_SSE2 = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
+ hsig_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
+ seps_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
+
+ sig_SSE0 = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
+ eps_SSE0 = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
+#ifndef HALF_LJ
+ sig_SSE2 = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
+ eps_SSE2 = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifndef CUTOFF_BLENDV
+ rinv_SSE0 = gmx_and_pr(rinv_SSE0,wco_SSE0);
+ rinv_SSE2 = gmx_and_pr(rinv_SSE2,wco_SSE2);
+#else
+ /* We only need to mask for the cut-off: blendv is faster */
+ rinv_SSE0 = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
+ rinv_SSE2 = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
+#endif
+
+ rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
+ rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
+
+#ifdef CALC_COULOMB
+ /* Note that here we calculate force*r, not the usual force/r.
+ * This allows avoiding masking the reaction-field contribution,
+ * as frcoul is later multiplied by rinvsq which has been
+ * masked with the cut-off check.
+ */
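+    /* Worked out for reaction-field as a sketch (mrc_3_SSE holds -2*k_rf
+     * per the kernel setup):
+     *   frcoul = qq*(1/r + mrc_3*r^2)
+     *   fscal  = rinvsq*frcoul = qq*(1/r^3 - 2*k_rf), the usual force/r.
+     */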
+
+#ifdef EXCL_FORCES
+ /* Only add 1/r for non-excluded atom pairs */
+ rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0,int_SSE0);
+ rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2,int_SSE2);
+#else
+ /* No exclusion forces, we always need 1/r */
+#define rinv_ex_SSE0 rinv_SSE0
+#define rinv_ex_SSE2 rinv_SSE2
+#endif
+
+#ifdef CALC_COUL_RF
+ /* Electrostatic interactions */
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
+
+#ifdef CALC_ENERGIES
+ vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
+ vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ /* We need to mask (or limit) rsq for the cut-off,
+ * as large distances can cause an overflow in gmx_pmecorrF/V.
+ */
+#ifndef CUTOFF_BLENDV
+ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0));
+ brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2));
+#else
+ /* Strangely, putting mul on a separate line is slower (icc 13) */
+ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)));
+ brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)));
+#endif
+ ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE);
+ ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE);
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE);
+ vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE);
+#endif
+
+#endif /* CALC_COUL_EWALD */
+
+#ifdef CALC_COUL_TAB
+ /* Electrostatic interactions */
+ r_SSE0 = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
+ r_SSE2 = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
+ /* Convert r to scaled table units */
+ rs_SSE0 = gmx_mul_pr(r_SSE0,invtsp_SSE);
+ rs_SSE2 = gmx_mul_pr(r_SSE2,invtsp_SSE);
+ /* Truncate scaled r to an int */
+ ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0);
+ ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2);
+#ifdef GMX_X86_SSE4_1
+ /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
+ rf_SSE0 = gmx_floor_pr(rs_SSE0);
+ rf_SSE2 = gmx_floor_pr(rs_SSE2);
+#else
+ rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0);
+ rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2);
+#endif
+ frac_SSE0 = gmx_sub_pr(rs_SSE0,rf_SSE0);
+ frac_SSE2 = gmx_sub_pr(rs_SSE2,rf_SSE2);
+
+ /* Load and interpolate table forces and possibly energies.
+ * Force and energy can be combined in one table, stride 4: FDV0
+ * or in two separate tables with stride 1: F and V
+ * Currently single precision uses FDV0, double F and V.
+ */
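+    /* FDV0 layout per table point i, as a sketch of the assumed format:
+     *   tab[4*i+0] = F[i], tab[4*i+1] = F[i+1]-F[i] (the D difference),
+     *   tab[4*i+2] = V[i], tab[4*i+3] = 0,
+     * so force and energy come from one aligned 4-real load.
+     */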
+#ifndef CALC_ENERGIES
+ load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
+ load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
+#else
+#ifdef TAB_FDV0
+ load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+ load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#else
+ load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+ load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#endif
+#endif
+ fsub_SSE0 = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
+ fsub_SSE2 = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
+ vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
+#endif
+#endif /* CALC_COUL_TAB */
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+ /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
+ vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
+ vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
+#else
+ vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
+ vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
+#endif
+#endif
+
+ vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
+ vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
+#endif
+
+#ifdef CALC_ENERGIES
+ /* Mask energy for cut-off and diagonal */
+ vcoul_SSE0 = gmx_and_pr(vcoul_SSE0,wco_SSE0);
+ vcoul_SSE2 = gmx_and_pr(vcoul_SSE2,wco_SSE2);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+ /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+ wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
+#ifndef HALF_LJ
+ wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
+#endif
+#else
+ /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define wco_vdw_SSE0 wco_SSE0
+#define wco_vdw_SSE2 wco_SSE2
+#endif
+
+#ifndef LJ_COMB_LB
+ rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
+#ifdef EXCL_FORCES
+ rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+ rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
+#ifdef EXCL_FORCES
+ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+ FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
+#ifndef HALF_LJ
+ FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
+#endif
+ FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
+#ifndef HALF_LJ
+ FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+ sir_SSE0 = gmx_mul_pr(sig_SSE0,rinv_SSE0);
+#ifndef HALF_LJ
+ sir_SSE2 = gmx_mul_pr(sig_SSE2,rinv_SSE2);
+#endif
+ sir2_SSE0 = gmx_mul_pr(sir_SSE0,sir_SSE0);
+#ifndef HALF_LJ
+ sir2_SSE2 = gmx_mul_pr(sir_SSE2,sir_SSE2);
+#endif
+ sir6_SSE0 = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
+#ifdef EXCL_FORCES
+ sir6_SSE0 = gmx_and_pr(sir6_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+ sir6_SSE2 = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
+#ifdef EXCL_FORCES
+ sir6_SSE2 = gmx_and_pr(sir6_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ sir6_SSE0 = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ sir6_SSE2 = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+ FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+ FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2,sir6_SSE2);
+#endif
+ FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+ FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
+#endif
+#if defined CALC_ENERGIES
+ /* We need C6 and C12 to calculate the LJ potential shift */
+ sig2_SSE0 = gmx_mul_pr(sig_SSE0,sig_SSE0);
+#ifndef HALF_LJ
+ sig2_SSE2 = gmx_mul_pr(sig_SSE2,sig_SSE2);
+#endif
+ sig6_SSE0 = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
+#ifndef HALF_LJ
+ sig6_SSE2 = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
+#endif
+ c6_SSE0 = gmx_mul_pr(eps_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+ c6_SSE2 = gmx_mul_pr(eps_SSE2,sig6_SSE2);
+#endif
+ c12_SSE0 = gmx_mul_pr(c6_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+ c12_SSE2 = gmx_mul_pr(c6_SSE2,sig6_SSE2);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+             * complicated when the i- and j-cluster sizes don't match.
+ */
+ {
+ int egps_j;
+#if UNROLLJ == 2
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for(jj=0; jj<(UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
+#endif
+ }
+#endif
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+ vctotSSE = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2));
+#else
+ add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj);
+ add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+ /* Calculate the LJ energies */
+ VLJ6_SSE0 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
+#ifndef HALF_LJ
+ VLJ6_SSE2 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
+#endif
+ VLJ12_SSE0 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
+#ifndef HALF_LJ
+ VLJ12_SSE2 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
+#endif
+
+ VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
+#endif
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
+#endif
+#ifdef CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
+ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,int_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,int_SSE2);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+ VvdwtotSSE = gmx_add_pr(VvdwtotSSE,
+#ifndef HALF_LJ
+ gmx_add_pr(VLJ_SSE0,VLJ_SSE2)
+#else
+ VLJ_SSE0
+#endif
+ );
+#else
+ add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj);
+#ifndef HALF_LJ
+ add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+ fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_SSE0,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
+#else
+ fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_SSE2,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
+#else
+    /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
+ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
+#endif
+
+ /* Calculate temporary vectorial force */
+ tx_SSE0 = gmx_mul_pr(fscal_SSE0,dx_SSE0);
+ tx_SSE2 = gmx_mul_pr(fscal_SSE2,dx_SSE2);
+ ty_SSE0 = gmx_mul_pr(fscal_SSE0,dy_SSE0);
+ ty_SSE2 = gmx_mul_pr(fscal_SSE2,dy_SSE2);
+ tz_SSE0 = gmx_mul_pr(fscal_SSE0,dz_SSE0);
+ tz_SSE2 = gmx_mul_pr(fscal_SSE2,dz_SSE2);
+
+ /* Increment i atom force */
+ fix_SSE0 = gmx_add_pr(fix_SSE0,tx_SSE0);
+ fix_SSE2 = gmx_add_pr(fix_SSE2,tx_SSE2);
+ fiy_SSE0 = gmx_add_pr(fiy_SSE0,ty_SSE0);
+ fiy_SSE2 = gmx_add_pr(fiy_SSE2,ty_SSE2);
+ fiz_SSE0 = gmx_add_pr(fiz_SSE0,tz_SSE0);
+ fiz_SSE2 = gmx_add_pr(fiz_SSE2,tz_SSE2);
+
+ /* Decrement j atom force */
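+        /* gmx_sum4_hpr adds the two accumulators and folds the two
+         * 128-bit lanes, giving the total force on the N j-atoms from
+         * all four i-atoms in one half-width register
+         * (see gmx_mm256_sum4h_m128). */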
+ gmx_store_hpr(f+ajx,
+ gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) ));
+ gmx_store_hpr(f+ajy,
+ gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) ));
+ gmx_store_hpr(f+ajz,
+ gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) ));
+ }
+
+#undef rinv_ex_SSE0
+#undef rinv_ex_SSE2
+
+#undef wco_vdw_SSE0
+#undef wco_vdw_SSE2
+
+#undef CUTOFF_BLENDV
+
+#undef EXCL_FORCES
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+#include "gmx_simd_macros.h"
+
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
+
+#if defined GMX_MM128_HERE || defined GMX_DOUBLE
+#define STRIDE 4
+#endif
+#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#define STRIDE 4
+#endif
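+
+/* STRIDE is the coordinate packing width: the x-coordinates of STRIDE
+ * atoms are stored contiguously, followed by their y- and z-coordinates
+ * (hence ajy = ajx + STRIDE in the inner loop); this matches the
+ * nbatX4/nbatX8 packed formats.
+ */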
+
+#ifdef GMX_MM128_HERE
+#ifndef GMX_DOUBLE
+/* SSE single precision 4x4 kernel */
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#define TAB_FDV0
+#else
+/* SSE double precision 4x2 kernel */
+#define SUM_SIMD(x) (x[0]+x[1])
+#endif
+#endif
+
+#ifdef GMX_MM256_HERE
+#ifndef GMX_DOUBLE
+/* AVX single precision 4x8 kernel */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#define TAB_FDV0
+#else
+/* AVX double precision 4x4 kernel */
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#endif
+#endif
+
+#define SIMD_MASK_ALL 0xffffffff
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES and ENERGY_GROUPS, which are defined before.
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and have a different combination rule for different
+ * ci's, as using no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
+#else
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
+#endif
+#endif
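+
+/* For example, with CALC_COUL_RF and LJ_COMB_GEOM defined,
+ * NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener) expands to
+ * nbnxn_kernel_simd_2xnn_rf_comb_geom_ener.
+ */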
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+ (const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f
+#ifdef CALC_SHIFTFORCES
+ ,
+ real *fshift
+#endif
+#ifdef CALC_ENERGIES
+ ,
+ real *Vvdw,
+ real *Vc
+#endif
+ )
+{
+ const nbnxn_ci_t *nbln;
+ const nbnxn_cj_t *l_cj;
+ const int *type;
+ const real *q;
+ const real *shiftvec;
+ const real *x;
+ const real *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
+ real facel;
+ real *nbfp_ptr;
+ int nbfp_stride;
+ int n,ci,ci_sh;
+ int ish,ish3;
+ gmx_bool half_LJ,do_coul;
+ int sci,scix,sciy,sciz,sci2;
+ int cjind0,cjind1,cjind;
+ int ip,jp;
+
+#ifdef ENERGY_GROUPS
+ int Vstride_i;
+ int egps_ishift,egps_imask;
+ int egps_jshift,egps_jmask,egps_jstride;
+ int egps_i;
+ real *vvdwtp[UNROLLI];
+ real *vctp[UNROLLI];
+#endif
+
+ gmx_mm_pr shX_SSE;
+ gmx_mm_pr shY_SSE;
+ gmx_mm_pr shZ_SSE;
+ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
+ gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
+ gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0;
+ gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2;
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+ __m128 fix_SSE,fiy_SSE,fiz_SSE;
+#else
+ __m256d fix_SSE,fiy_SSE,fiz_SSE;
+#endif
+#else
+ __m128d fix0_SSE,fiy0_SSE,fiz0_SSE;
+ __m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
+#endif
+
+ /* AVX: use floating point masks, as there are no integer instructions */
+ gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
+ gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+
+ gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
+ gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
+
+#ifndef GMX_MM256_HERE
+ __m128i zeroi_SSE = _mm_setzero_si128();
+#endif
+#ifdef GMX_X86_SSE4_1
+ gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+#endif
+
+ gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
+ gmx_mm_pr iq_SSE0=gmx_setzero_pr();
+ gmx_mm_pr iq_SSE2=gmx_setzero_pr();
+ gmx_mm_pr mrc_3_SSE;
+#ifdef CALC_ENERGIES
+ gmx_mm_pr hrc_3_SSE,moh_rc_SSE;
+#endif
+
+#ifdef CALC_COUL_TAB
+ /* Coulomb table variables */
+ gmx_mm_pr invtsp_SSE;
+ const real *tab_coul_F;
+#ifndef TAB_FDV0
+ const real *tab_coul_V;
+#endif
+#ifdef GMX_MM256_HERE
+ int ti0_array[2*UNROLLJ-1],*ti0;
+ int ti2_array[2*UNROLLJ-1],*ti2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr mhalfsp_SSE;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr beta2_SSE,beta_SSE;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ gmx_mm_pr sh_ewald_SSE;
+#endif
+
+#ifdef LJ_COMB_LB
+ const real *ljc;
+
+ gmx_mm_pr hsig_i_SSE0,seps_i_SSE0;
+ gmx_mm_pr hsig_i_SSE2,seps_i_SSE2;
+#else
+#ifdef FIX_LJ_C
+ real pvdw_array[2*UNROLLI*UNROLLJ+3];
+ real *pvdw_c6,*pvdw_c12;
+ gmx_mm_pr c6_SSE0,c12_SSE0;
+ gmx_mm_pr c6_SSE1,c12_SSE1;
+ gmx_mm_pr c6_SSE2,c12_SSE2;
+ gmx_mm_pr c6_SSE3,c12_SSE3;
+#endif
+
+#ifdef LJ_COMB_GEOM
+ const real *ljc;
+
+ gmx_mm_pr c6s_SSE0,c12s_SSE0;
+ gmx_mm_pr c6s_SSE1,c12s_SSE1;
+ gmx_mm_pr c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
+ gmx_mm_pr c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+ gmx_mm_pr vctotSSE,VvdwtotSSE;
+ gmx_mm_pr sixthSSE,twelvethSSE;
+
+ gmx_mm_pr avoid_sing_SSE;
+ gmx_mm_pr rc2_SSE;
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr rcvdw2_SSE;
+#endif
+
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sh_invrc6_SSE,sh_invrc12_SSE;
+
+ /* cppcheck-suppress unassignedVariable */
+ real tmpsum_array[15],*tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+ /* cppcheck-suppress unassignedVariable */
+ real shf_array[15],*shf;
+#endif
+
+ int ninner;
+
+#ifdef COUNT_PAIRS
+ int npair=0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ ljc = nbat->lj_comb;
+#else
+ /* No combination rule used */
+#ifndef GMX_DOUBLE
+ nbfp_ptr = nbat->nbfp_s4;
+#define NBFP_STRIDE 4
+#else
+ nbfp_ptr = nbat->nbfp;
+#define NBFP_STRIDE 2
+#endif
+ nbfp_stride = NBFP_STRIDE;
+#endif
+
+#ifdef CALC_COUL_TAB
+#ifdef GMX_MM256_HERE
+ /* Generate aligned table pointers */
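+ /* The ti arrays above have UNROLLJ-1 ints of slack, so the pointers
+ * can be rounded up to a UNROLLJ*sizeof(real) byte boundary while
+ * still leaving room for UNROLLJ table indices.
+ */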
+ ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+#endif
+
+ invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+ mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+ tab_coul_F = ic->tabq_coul_FDV0;
+#else
+ tab_coul_F = ic->tabq_coul_F;
+ tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+ beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+ q = nbat->q;
+ type = nbat->type;
+ facel = ic->epsfac;
+ shiftvec = shift_vec[0];
+ x = nbat->x;
+
+ avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+ /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+ rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+ rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+ sixthSSE = gmx_set1_pr(1.0/6.0);
+ twelvethSSE = gmx_set1_pr(1.0/12.0);
+
+ sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+ mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+ hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+
+ moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+#endif
+
+#ifdef CALC_ENERGIES
+ tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+#endif
+#ifdef CALC_SHIFTFORCES
+ shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+#endif
+
+#ifdef FIX_LJ_C
+ pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+ for(jp=0; jp<UNROLLJ; jp++)
+ {
+ pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+ pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ }
+ c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+ egps_ishift = nbat->neg_2log;
+ egps_imask = (1<<egps_ishift) - 1;
+ egps_jshift = 2*nbat->neg_2log;
+ egps_jmask = (1<<egps_jshift) - 1;
+ egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+ l_cj = nbl->cj;
+
+ ninner = 0;
+ for(n=0; n<nbl->nci; n++)
+ {
+ nbln = &nbl->ci[n];
+
+ ish = (nbln->shift & NBNXN_CI_SHIFT);
+ ish3 = ish*3;
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
+ /* Currently only works for super-cells equal to sub-cells */
+ ci = nbln->ci;
+ ci_sh = (ish == CENTRAL ? ci : -1);
+
+ shX_SSE = gmx_load1_pr(shiftvec+ish3);
+ shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+ sci = ci*STRIDE;
+ scix = sci*DIM;
+ sci2 = sci*2;
+#else
+ sci = (ci>>1)*STRIDE;
+ scix = sci*DIM + (ci & 1)*(STRIDE>>1);
+ sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
+ sci += (ci & 1)*(STRIDE>>1);
+#endif
+
+ half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+
+#ifdef ENERGY_GROUPS
+ egps_i = nbat->energrp[ci];
+ {
+ int ia,egp_ia;
+
+ for(ia=0; ia<UNROLLI; ia++)
+ {
+ egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ vctp[ia] = Vc + egp_ia*Vstride_i;
+ }
+ }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+ {
+ int ia;
+ real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+ Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+ Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+ Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+ /* beta/sqrt(pi) */
+ Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+ for(ia=0; ia<UNROLLI; ia++)
+ {
+ real qi;
+
+ qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+ vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+ Vc[0]
+#endif
+ -= facel*qi*qi*Vc_sub_self;
+ }
+ }
+#endif
+
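+/* gmx_load2_hpr(x) is assumed to broadcast x[0] into the lower and x[1]
+ * into the upper 128-bit half of a full-width register, so each of the
+ * two i-atoms of a pair occupies half the SIMD width.
+ */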
+#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1)
+
+ /* Load i atom data */
+ sciy = scix + STRIDE;
+ sciz = sciy + STRIDE;
+ ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix) ,shX_SSE);
+ ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE);
+ iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy) ,shY_SSE);
+ iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE);
+ iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE);
+ iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
+
+ /* With half_LJ we currently always calculate Coulomb interactions */
+ if (do_coul || half_LJ)
+ {
+ gmx_mm_pr facel_SSE;
+
+ facel_SSE = gmx_set1_pr(facel);
+
+ iq_SSE0 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci));
+ iq_SSE2 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2));
+ }
+
+#ifdef LJ_COMB_LB
+ hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
+ hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
+ seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+ seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+#else
+#ifdef LJ_COMB_GEOM
+ c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
+ if (!half_LJ)
+ {
+ c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
+ }
+ c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+ if (!half_LJ)
+ {
+ c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+ }
+#else
+ nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
+ nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ if (!half_LJ)
+ {
+ nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ }
+#endif
+#endif
+
+ /* Zero the potential energy for this list */
+ VvdwtotSSE = gmx_setzero_pr();
+ vctotSSE = gmx_setzero_pr();
+
+ /* Clear i atom forces */
+ fix_SSE0 = gmx_setzero_pr();
+ fix_SSE2 = gmx_setzero_pr();
+ fiy_SSE0 = gmx_setzero_pr();
+ fiy_SSE2 = gmx_setzero_pr();
+ fiz_SSE0 = gmx_setzero_pr();
+ fiz_SSE2 = gmx_setzero_pr();
+
+ cjind = cjind0;
+
+ /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+ if (half_LJ)
+ {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef HALF_LJ
+#undef CALC_COULOMB
+ }
+ else if (do_coul)
+ {
+#define CALC_COULOMB
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef CALC_COULOMB
+ }
+ else
+ {
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+ }
+#undef CALC_LJ
+ ninner += cjind1 - cjind0;
+
+ /* Add accumulated i-forces to the force array */
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+#define gmx_load_ps4 _mm_load_ps
+#define gmx_store_ps4 _mm_store_ps
+#define gmx_add_ps4 _mm_add_ps
+#else
+#define gmx_load_ps4 _mm256_load_pd
+#define gmx_store_ps4 _mm256_store_pd
+#define gmx_add_ps4 _mm256_add_pd
+#endif
+ GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0,fix_SSE2,fix_SSE);
+ gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0,fiy_SSE2,fiy_SSE);
+ gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0,fiz_SSE2,fiz_SSE);
+ gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+ gmx_store_ps4(shf,fix_SSE);
+ fshift[ish3+0] += SUM_SIMD4(shf);
+ gmx_store_ps4(shf,fiy_SSE);
+ fshift[ish3+1] += SUM_SIMD4(shf);
+ gmx_store_ps4(shf,fiz_SSE);
+ fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
+ _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
+ _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
+ _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
+ _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
+ _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
+ _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
+ _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
+ fshift[ish3+0] += shf[0] + shf[1];
+ _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
+ fshift[ish3+1] += shf[0] + shf[1];
+ _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
+ fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ if (do_coul)
+ {
+ gmx_store_pr(tmpsum,vctotSSE);
+ *Vc += SUM_SIMD(tmpsum);
+ }
+
+ gmx_store_pr(tmpsum,VvdwtotSSE);
+ *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+ /* Outer loop uses 6 flops/iteration */
+ }
+
+#ifdef COUNT_PAIRS
+ printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef gmx_load2_hpr
+
+#undef gmx_load_ps4
+#undef gmx_store_ps4
+#undef gmx_add_ps4
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
-#ifdef GMX_X86_AVX_256
+#ifdef GMX_NBNXN_SIMD_4XN
-#include "nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernel_simd_4xn.h"
-/* Include all flavors of the 256-bit AVX kernel loops */
+/* Include all flavors of the SSE or AVX 4xN kernel loops */
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef CALC_COUL_RF
#define CALC_COUL_TAB
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_TAB
#define CALC_COUL_EWALD
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_EWALD
enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener
static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp
static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener
static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
int ng_p2,i,j,j0,j1,c,s;
-#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
-
ng_p2 = (1<<ng_2log);
/* The size of the x86 SIMD energy group buffer array is:
- * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+ * ng*ng*ng_p2*unrollj_half*simd_width
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_HALF; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
+ for(s=0; s<unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += SIMD_WIDTH + 2;
+ c += simd_width + 2;
}
}
}
}
}
-#endif /* GMX_X86_AVX_256 */
+#endif /* GMX_NBNXN_SIMD_4XN */
void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw)
-#ifdef GMX_X86_AVX_256
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw)
+#ifdef GMX_NBNXN_SIMD_4XN
{
int nnbl;
nbnxn_pairlist_t **nbl;
}
#else
{
- gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
+ gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
}
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef _nbnxn_kernel_x86_simd256_h
-#define _nbnxn_kernel_x86_simd256_h
+#ifndef _nbnxn_kernel_simd_4xn_h
+#define _nbnxn_kernel_simd_4xn_h
#include "typedefs.h"
extern "C" {
#endif
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels calculate 4xN cluster interactions for SIMD width N.
+ */
void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw);
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw);
#ifdef __cplusplus
}
/* Include the force+energy kernels */
#define CALC_ENERGIES
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef CALC_ENERGIES
/* Include the force+energygroups kernels */
#define CALC_ENERGIES
#define ENERGY_GROUPS
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef ENERGY_GROUPS
#undef CALC_ENERGIES
/* Include the force only kernels */
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* This is the innermost loop contents for the n vs n atom
- * SSE2 single precision kernels.
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel calculates interactions of 4 i-atoms
+ * with N j-atoms stored in N-wide SIMD registers.
*/
gmx_mm_pr r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
gmx_mm_pr r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
gmx_mm_pr r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
- /* Table index: rs converted to an int */
+ /* Table index: rs truncated to an int */
#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
gmx_epi32 ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
#else
jxSSE = gmx_load_pr(x+ajx);
jySSE = gmx_load_pr(x+ajy);
jzSSE = gmx_load_pr(x+ajz);
-
+
/* Calculate distance */
dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE);
dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE);
dx_SSE3 = gmx_sub_pr(ix_SSE3,jxSSE);
dy_SSE3 = gmx_sub_pr(iy_SSE3,jySSE);
dz_SSE3 = gmx_sub_pr(iz_SSE3,jzSSE);
-
+
/* rsq = dx*dx+dy*dy+dz*dz */
rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
*/
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE
+#define UNROLLJ GMX_SIMD_WIDTH_HERE
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE 4
#define SIMD_MASK_ALL 0xffffffff
-#include "nbnxn_kernel_x86_simd_utils.h"
+#include "nbnxn_kernel_simd_utils.h"
/* All functionality defines are set here, except for:
* CALC_ENERGIES, ENERGY_GROUPS which are defined before.
/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
-#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
#if defined LJ_COMB_GEOM
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
#else
#if defined LJ_COMB_LB
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
#else
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
#endif
#endif
#ifdef CALC_COUL_RF
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
#else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
#endif
#endif
#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
#else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
#endif
#endif
-#ifdef GMX_MM128_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
-#endif
-#ifdef GMX_MM256_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
-#endif
-
static void
#ifndef CALC_ENERGIES
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,noener)
#else
#ifndef ENERGY_GROUPS
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,ener)
#else
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
#endif
#endif
#undef NBK_FUNC_NAME
}
#endif
- /* Load i atom data */
+ /* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix) ,shX_SSE);
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
#undef HALF_LJ
#undef CALC_COULOMB
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
#undef CALC_COULOMB
}
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
}
#undef CALC_LJ
i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
}
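+/* Reduces two registers that each hold partial sums for two i-atoms,
+ * one per 128-bit half, to a single 128-bit result with one total per
+ * i-atom: o_SSE = ( S0, S1, S2, S3 ).
+ */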
+#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE) \
+{ \
+ i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps()); \
+ i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps()); \
+ i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
+ i_SSE2 = _mm256_permute_ps(i_SSE0,0xb1); /* 0b10110001: swap adjacent pair order */ \
+ o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+}
#else
#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
{ \
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
}
+#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE) \
+{ \
+ __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
+ int p; \
+ \
+ for(p=0; p<2*UNROLLJ; p++) \
+ { \
+ /* Here we load 4 aligned floats, but we need just 2 */ \
+ clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
+ } \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+ \
+ GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE); \
+ GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
+}
+
#endif
#if defined GMX_MM128_HERE && defined GMX_DOUBLE
/* Add energy register to possibly multiple terms in the energy array.
* This function is the same for SSE/AVX single/double.
*/
-static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
+static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,const int *offset_jj)
{
int jj;
{
gmx_mm_pr v_SSE;
- v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
- gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
+ v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
}
}
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
+/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
+static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
+ real *v0,real *v1,const int *offset_jj)
+{
+ gmx_mm_hpr e_SSE0,e_SSE1;
+ int jj;
+
+ e_SSE0 = _mm256_extractf128_ps(e_SSE,0);
+ e_SSE1 = _mm256_extractf128_ps(e_SSE,1);
+
+ for(jj=0; jj<(UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_SSE;
+
+ v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
+ }
+ for(jj=0; jj<(UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_SSE;
+
+ v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
+ }
+}
+#endif
+
#endif /* _nbnxn_kernel_sse_utils_h_ */
#ifndef GMX_DOUBLE
#define NBNXN_SEARCH_SSE_SINGLE
-#include "gmx_x86_simd_single.h"
-#else
-#include "gmx_x86_simd_double.h"
#endif
+/* Include basic SSE2 stuff */
+#include <emmintrin.h>
+
#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
#define NBNXN_8BB_SSE
#endif
#define STRIDE_8BB 4
#define STRIDE_8BB_2LOG 2
+#endif /* NBNXN_SEARCH_SSE */
+
+#ifdef GMX_NBNXN_SIMD
/* The functions below are macros as they are performance sensitive */
#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
/* The j-cluster size is matched to the SIMD width */
-#ifndef GMX_DOUBLE
-/* 128 bits can hold 4 floats */
-#define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci)
-#define X_IND_CI_S128(ci) X_IND_CI_J4(ci)
-#define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj)
-/* 256 bits can hold 8 floats */
-#define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci)
-#define X_IND_CI_S256(ci) X_IND_CI_J8(ci)
-#define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj)
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
+#endif
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
+/* Half-SIMD-width j-cluster size */
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
+#endif
#else
-/* 128 bits can hold 2 doubles */
-#define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci)
-#define X_IND_CI_S128(ci) X_IND_CI_J2(ci)
-#define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj)
-/* 256 bits can hold 4 doubles */
-#define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci)
-#define X_IND_CI_S256(ci) X_IND_CI_J4(ci)
-#define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj)
+#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#endif
#endif
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* GMX_NBNXN_SIMD */
/* Interaction masks for 4xN atom interactions.
{
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
- case nbk4xN_X86_SIMD128:
- case nbk4xN_X86_SIMD256:
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
return NBNXN_CPU_CLUSTER_I_SIZE;
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
/* The cluster size for super/sub lists is only set here.
* Any value should work for the pair-search and atomdata code.
* The kernels, of course, might require a particular value.
int nbnxn_kernel_to_cj_size(int nb_kernel_type)
{
+ int nbnxn_simd_width=0;
+ int cj_size=0;
+
+#ifdef GMX_NBNXN_SIMD
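+ /* The SIMD width in reals: e.g. with 256-bit SIMD this gives
+ * 8 reals in single and 4 reals in double precision.
+ */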
+ nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#endif
+
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
- return NBNXN_CPU_CLUSTER_I_SIZE;
- case nbk4xN_X86_SIMD128:
- /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
- return 16/sizeof(real);
- case nbk4xN_X86_SIMD256:
- /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
- return 32/sizeof(real);
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
- return nbnxn_kernel_to_ci_size(nb_kernel_type);
+ case nbnxnk4x4_PlainC:
+ cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
+ break;
+ case nbnxnk4xN_SIMD_4xN:
+ cj_size = nbnxn_simd_width;
+ break;
+ case nbnxnk4xN_SIMD_2xNN:
+ cj_size = nbnxn_simd_width/2;
+ break;
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
+ break;
default:
gmx_incons("unknown kernel type");
}
- return 0;
+ return cj_size;
}
static int ci_to_cj(int na_cj_2log,int ci)
gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
{
- if (nb_kernel_type == nbkNotSet)
+ if (nb_kernel_type == nbnxnkNotSet)
{
gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
}
switch (nb_kernel_type)
{
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
return FALSE;
- case nbk4x4_PlainC:
- case nbk4xN_X86_SIMD128:
- case nbk4xN_X86_SIMD256:
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
return TRUE;
default:
snew(nbl->work,1);
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
#else
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
-#endif
- snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
-#ifdef NBNXN_SEARCH_SSE
- snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
-#ifdef GMX_X86_AVX_256
- snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
#endif
+ snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+#ifdef GMX_NBNXN_SIMD
+ snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
+ snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
#endif
- snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
+ snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
}
-#ifdef NBNXN_SEARCH_SSE
/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
{
#endif
}
-#ifdef GMX_X86_AVX_256
/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
{
return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
(rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
NBNXN_INT_MASK_ALL));
-#else /* cj-size = 2 */
+#else /* cj-size = 4 */
return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
#endif
}
+
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define get_imask_x86_simd_4xn get_imask_x86_simd128
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define get_imask_x86_simd_4xn get_imask_x86_simd256
+#define get_imask_x86_simd_2xnn get_imask_x86_simd128
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
#endif
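+/* Thus with 256-bit SIMD the 4xN kernel uses the 8-wide j-cluster mask
+ * and the 2xNN kernel the 4-wide one, matching their j-cluster sizes.
+ */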
-#endif /* NBNXN_SEARCH_SSE */
/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
* Checks bounding box distances and possibly atom pair distances.
}
}
-#ifdef NBNXN_SEARCH_SSE
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S PACK_X4
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S GMX_X86_SIMD_WIDTH_HERE
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM256_HERE
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "nbnxn_search_simd_4xn.h"
#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "nbnxn_search_simd_2xnn.h"
#endif
/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
+ case nbnxnk4x4_PlainC:
check_subcell_list_space_simple(nbl,cl-cf+1);
make_cluster_list_simple(gridj,
rl2,rbb2,
&ndistc);
break;
-#ifdef NBNXN_SEARCH_SSE
- case nbk4xN_X86_SIMD128:
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- make_cluster_list_x86_simd128(gridj,
- nbl,ci,cf,cl,
- (gridi == gridj && shift == CENTRAL),
- nbat->x,
- rl2,rbb2,
- &ndistc);
+ make_cluster_list_simd_4xn(gridj,
+ nbl,ci,cf,cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2,rbb2,
+ &ndistc);
break;
-#ifdef GMX_X86_AVX_256
- case nbk4xN_X86_SIMD256:
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- make_cluster_list_x86_simd256(gridj,
- nbl,ci,cf,cl,
- (gridi == gridj && shift == CENTRAL),
- nbat->x,
- rl2,rbb2,
- &ndistc);
+ make_cluster_list_simd_2xnn(gridj,
+ nbl,ci,cf,cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2,rbb2,
+ &ndistc);
break;
#endif
-#endif
- case nbk8x8x8_PlainC:
- case nbk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
check_subcell_list_space_supersub(nbl,cl-cf+1);
for(cj=cf; cj<=cl; cj++)
{
{
switch (nb_kernel_type)
{
-#ifdef NBNXN_SEARCH_SSE
- case nbk4xN_X86_SIMD128:
- nbs->icell_set_x = icell_set_x_x86_simd128;
- break;
-#ifdef GMX_X86_AVX_256
- case nbk4xN_X86_SIMD256:
- nbs->icell_set_x = icell_set_x_x86_simd256;
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
+ nbs->icell_set_x = icell_set_x_simd_4xn;
break;
#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
+ nbs->icell_set_x = icell_set_x_simd_2xnn;
+ break;
#endif
default:
nbs->icell_set_x = icell_set_x_simple;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
+#else
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
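+/* Loads a half-width set of j-atom coordinates and duplicates it into
+ * both 128-bit halves, so both i-atoms of a pair see the same j-atoms
+ * (assuming gmx_2hpr_to_pr concatenates its two half registers).
+ */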
+static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+{
+ gmx_mm_hpr a_SSE;
+
+ a_SSE = _mm_load_ps(a);
+
+ return gmx_2hpr_to_pr(a_SSE,a_SSE);
+}
+
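+/* Returns a full-width register with a[0]+shift broadcast in one
+ * 128-bit half and a[1]+shift in the other; which half holds which
+ * value follows the gmx_2hpr_to_pr argument convention.
+ */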
+static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift)
+{
+ gmx_mm_hpr a0,a1;
+
+ a0 = _mm_set1_ps(a[0] + shift);
+ a1 = _mm_set1_ps(a[1] + shift);
+
+ return gmx_2hpr_to_pr(a1,a0);
+}
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_2xnn(int ci,
+ real shx,real shy,real shz,
+ int na_c,
+ int stride,const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_2xnn_t *x_ci;
+
+ x_ci = work->x_ci_simd_2xnn;
+
+ ia = X_IND_CI_SIMD_2XNN(ci);
+
+ x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+ x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+ x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+ x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+ x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+ x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+}
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bounding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci,int cjf,int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2,float rbb2,
+ int *ndistc)
+{
+ const nbnxn_x_ci_simd_2xnn_t *work;
+ const float *bb_ci;
+
+ gmx_mm_pr jx_SSE,jy_SSE,jz_SSE;
+
+ gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0;
+ gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2;
+
+ gmx_mm_pr rsq_SSE0;
+ gmx_mm_pr rsq_SSE2;
+
+ gmx_mm_pr wco_SSE0;
+ gmx_mm_pr wco_SSE2;
+ gmx_mm_pr wco_any_SSE;
+
+ gmx_mm_pr rc2_SSE;
+
+ gmx_bool InRange;
+ float d2;
+ int xind_f,xind_l,cj;
+
+ cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+ cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+
+ work = nbl->work->x_ci_simd_2xnn;
+
+ bb_ci = nbl->work->bb_ci;
+
+ rc2_SSE = gmx_set1_pr(rl2);
+
+ InRange = FALSE;
+ while (!InRange && cjf <= cjl)
+ {
+ d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+ wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+ InRange = gmx_movemask_pr(wco_any_SSE);
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjf++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
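+ /* Same check, now scanning backwards from cjl for the last j-cluster
+ * in range; all clusters in [cjf,cjl] then end up in the list.
+ */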
+ InRange = FALSE;
+ while (!InRange && cjl > cjf)
+ {
+ d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+ wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+ InRange = gmx_movemask_pr(wco_any_SSE);
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjl--;
+ }
+ }
+
+ if (cjf <= cjl)
+ {
+ for(cj=cjf; cj<=cjl; cj++)
+ {
+ /* Store cj and the interaction mask */
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj);
+ nbl->ncj++;
+ }
+ /* Increase the closing index in i super-cell list */
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ }
+}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file.
- * gmx_sse_or_avh.h should be included before including this file.
- */
-
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
-#ifdef GMX_MM128_HERE
-static void icell_set_x_x86_simd128
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
#else
-#ifdef GMX_MM256_HERE
-static void icell_set_x_x86_simd256
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
- (int ci,
- real shx,real shy,real shz,
- int na_c,
- int stride,const real *x,
- nbnxn_list_work_t *work)
-{
- int ia;
-#ifdef GMX_MM128_HERE
- nbnxn_x_ci_x86_simd128_t *x_ci;
-
- x_ci = work->x_ci_x86_simd128;
+#include "gmx_simd_macros.h"
- ia = X_IND_CI_S128(ci);
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE)
#else
- nbnxn_x_ci_x86_simd256_t *x_ci;
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_4xn(int ci,
+ real shx,real shy,real shz,
+ int na_c,
+ int stride,const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_4xn_t *x_ci;
- x_ci = work->x_ci_x86_simd256;
+ x_ci = work->x_ci_simd_4xn;
- ia = X_IND_CI_S256(ci);
-#endif
+ ia = X_IND_CI_SIMD_4XN(ci);
x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
}
-/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
 * Checks bounding box distances and possibly atom pair distances.
* This is an accelerated version of make_cluster_list_simple.
*/
-#ifdef GMX_MM128_HERE
-static void make_cluster_list_x86_simd128
-#else
-#ifdef GMX_MM256_HERE
-static void make_cluster_list_x86_simd256
-#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
-#endif
-#endif
- (const nbnxn_grid_t *gridj,
- nbnxn_pairlist_t *nbl,
- int ci,int cjf,int cjl,
- gmx_bool remove_sub_diag,
- const real *x_j,
- real rl2,float rbb2,
- int *ndistc)
+static gmx_inline void
+make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci,int cjf,int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2,float rbb2,
+ int *ndistc)
{
-#ifdef GMX_MM128_HERE
- const nbnxn_x_ci_x86_simd128_t *work;
-#else
- const nbnxn_x_ci_x86_simd256_t *work;
-#endif
-
+ const nbnxn_x_ci_simd_4xn_t *work;
const float *bb_ci;
gmx_mm_pr jx_SSE,jy_SSE,jz_SSE;
float d2;
int xind_f,xind_l,cj;
-#ifdef GMX_MM128_HERE
- cjf = CI_TO_CJ_S128(cjf);
- cjl = CI_TO_CJ_S128(cjl+1) - 1;
-
- work = nbl->work->x_ci_x86_simd128;
-#else
- cjf = CI_TO_CJ_S256(cjf);
- cjl = CI_TO_CJ_S256(cjl+1) - 1;
+ cjf = CI_TO_CJ_SIMD_4XN(cjf);
+ cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
- work = nbl->work->x_ci_x86_simd256;
-#endif
+ work = nbl->work->x_ci_simd_4xn;
bb_ci = nbl->work->bb_ci;
}
else if (d2 < rl2)
{
-#ifdef GMX_MM128_HERE
- xind_f = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf);
-#else
- xind_f = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf);
-#endif
+ xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+
jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
}
else if (d2 < rl2)
{
-#ifdef GMX_MM128_HERE
- xind_l = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl);
-#else
- xind_l = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl);
-#endif
+ xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+
jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
for(cj=cjf; cj<=cjl; cj++)
{
/* Store cj and the interaction mask */
-#ifdef GMX_MM128_HERE
- nbl->cj[nbl->ncj].cj = CI_TO_CJ_S128(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj);
-#else
- nbl->cj[nbl->ncj].cj = CI_TO_CJ_S256(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj);
-#endif
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj);
nbl->ncj++;
}
/* Increase the closing index in i super-cell list */
nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
}
}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#ifdef GMX_LIB_MPI
gmx_incons("Invalid cut-off scheme passed!");
}
- if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
{
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
switch (nbvg->kernel_type)
{
- case nbk4x4_PlainC:
+ case nbnxnk4x4_PlainC:
nbnxn_kernel_ref(&nbvg->nbl_lists,
nbvg->nbat, ic,
fr->shift_vec,
enerd->grpp.ener[egLJSR]);
break;
- case nbk4xN_X86_SIMD128:
- nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
+ case nbnxnk4xN_SIMD_4xN:
+ nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
break;
- case nbk4xN_X86_SIMD256:
- nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
+ case nbnxnk4xN_SIMD_2xNN:
+ nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
break;
- case nbk8x8x8_CUDA:
+ case nbnxnk8x8x8_CUDA:
nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
break;
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_PlainC:
nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
nbvg->nbat, ic,
fr->shift_vec,
gmx_incons("Invalid nonbonded kernel type passed!");
}
- if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
{
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
bDoForces = (flags & GMX_FORCE_FORCES);
bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
bUseGPU = fr->nbv->bUseGPU;
- bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+ bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
if (bStateChanged)
{
wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+ if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
{
/* initialize non-local pair-list on the GPU */
nbnxn_cuda_init_pairlist(nbv->cu_nbv,