set(CPACK_COMPONENT_GROUP_TOOLS_DESCRIPTION "All GROMACS executable tools")
set(CPACK_COMPONENT_GROUP_MDRUN_DESCRIPTION "GROMACS executable for running simulations")
-# override bugs on OS X where Cmake picks gcc (GNU) for C instead of system default cc (Clang).
-if(APPLE)
- set(CMAKE_C_COMPILER_INIT "cc")
-endif(APPLE)
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
project(Gromacs C)
include(Dart)
# machine with no git.
#
# NOTE: when releasing the "-dev" suffix needs to be stripped off!
-set(PROJECT_VERSION "4.6-beta2-dev")
+set(PROJECT_VERSION "4.6-beta3-dev")
set(CUSTOM_VERSION_STRING ""
CACHE STRING "Custom version string (if empty, use hard-coded default)")
mark_as_advanced(CUSTOM_VERSION_STRING)
# provide backward compatibility of software written against the Gromacs API.
set(API_VERSION ${NUM_VERSION})
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
endif()
option(GMX_FORCE_CXX "Enable C++ compilation even if not necessary" OFF)
mark_as_advanced(GMX_FORCE_CXX)
+option(GMX_NO_QUOTES "Disable Gromacs cool quotes" OFF)
+
if(GMX_GPU OR GMX_OPENMM OR GMX_FORCE_CXX)
enable_language(CXX)
endif()
SET(SHARED_LIBS_DEFAULT OFF)
else()
add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+ set(PKG_CFLAGS "$PKG_CFLAGS -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
endif()
IF (GMX_PREFER_STATIC_LIBS)
#Workaround for cmake bug 13174. Replace deprecated options.
IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+ if(BUILD_SHARED_LIBS)
+ STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+ SET(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} CACHE STRING "" FORCE)
+ endif()
STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
ENDIF()
option(GMX_THREAD_MPI "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
mark_as_advanced(GMX_FAHCORE)
endif(NOT DEFINED GMX_CPU_ACCELERATION)
set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
- CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
+ CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
set(GMX_FFT_LIBRARY "fftw3"
CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
if(GMX_SOFTWARE_INVSQRT)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
- set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
########################################################################
#Process MPI settings
endif()
endif()
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "FORTRAN")
-
-# Fortran is temporarily disabled while we push in nbNxN kernels.
-# We need to fake it a bit here to avoid jenkins build errors!
-# add_definitions(-DGMX_FORTRAN)
-
elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
if (NOT ACCELERATION_QUIETLY)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
# The automatic testing for endianness does not work for the BlueGene cross-compiler
set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
- set(GMX_POWER6 1)
- set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
- MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
+ MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
-if(GMX_FORTRAN OR GMX_POWER6)
- if (GMX_THREAD_MPI)
- message(FATAL_ERROR "FORTRAN/POWER6 is incompatible with thread-MPI and only provides a speed-up on certain IBM compilers. Disable FORTRAN (or threads if you really want to use FORTRAN kernels).")
- endif(GMX_THREAD_MPI)
- enable_language(Fortran)
- include(FortranCInterface)
- discover_fortran_mangling(prefix isupper suffix extra_under_score found)
- if(extra_under_score)
- set(extrasuffix "_")
- endif(extra_under_score)
- if(prefix)
- set(prefix "${prefix} ##")
- endif(prefix)
- if(suffix)
- set(suffix "## ${suffix}")
- if(extrasuffix)
- set(extrasuffix "${suffix}${extrasuffix}")
- endif(extrasuffix)
- else(suffix)
- if(extrasuffix)
- # Don't know if this is needed, but it can't hurt
- set(extrasuffix "## ${extrasuffix}")
- endif(extrasuffix)
- endif(suffix)
-
- if(isupper)
- set(F77_FUNCDEF "${prefix} NAME ${suffix}")
- set(F77_FUNCDEF_ "${prefix} NAME ${extrasuffix}")
- else(isupper)
- set(F77_FUNCDEF "${prefix} name ${suffix}")
- set(F77_FUNCDEF_ "${prefix} name ${extrasuffix}")
- endif(isupper)
-else(GMX_FORTRAN OR GMX_POWER6)
- set(F77_FUNCDEF "name ## _")
- set(F77_FUNCDEF_ "name ## _")
-endif(GMX_FORTRAN OR GMX_POWER6)
-
# Process QM/MM Settings
string(TOUPPER ${GMX_QMMM_PROGRAM} ${GMX_QMMM_PROGRAM})
if(${GMX_QMMM_PROGRAM} STREQUAL "GAUSSIAN")
unset(OpenMP_LINKER_FLAGS CACHE)
unset(OpenMP_SHARED_LINKER_FLAGS)
endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
######################################
# Output compiler and CFLAGS used
endif (${FFTW}_FOUND)
set(${FFTW}_HAVE_SIMD FALSE CACHE BOOL "If ${${FFTW}_PKG} was built with SIMD support")
-mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD)
+mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD ${FFTW}_HAVE_AVX)
set(GMX_CPU_ACCELERATION "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE)
set(GMX_FFT_LIBRARY "fftpack" CACHE STRING "Use fftpack for regressiontests reference builds" FORCE)
set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
- set(GMX_THREAD_MPI OFF OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
+ set(GMX_THREAD_MPI OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
+ # C_COMPILER_VERSION is not defined automatically for CMake below 2.8.8,
+ # so we call the GROMACS work-around for that
+ include(gmxGetCompilerInfo)
+ get_compiler_version()
if(NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR NOT "${C_COMPILER_VERSION}" MATCHES "4.7")
message(WARNING "Reference values for regressiontests should use Gromacs compiled with "
- "gcc-4.7, but your configuration is using ${CMAKE_C_COMPILER_ID}-${C_COMPILER_VERSION}.")
+ "gcc 4.7, but your configuration is using ${CMAKE_C_COMPILER_ID}-${C_COMPILER_VERSION}.")
endif()
endif()
+++ /dev/null
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2012, by the GROMACS development team, led by
-# David van der Spoel, Berk Hess, Erik Lindahl, and including many
-# others, as listed in the AUTHORS file in the top-level source
-# directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-#
-# Check GCC version and if any of the 4.1.x family compiler suites is found
-# quit the build system generating process.
-#
-# The GCC 4.1.x compilers contain an optimization related bug which might
-# results in code that exhibits incorrect behaviour and often leads to
-# exploding systems or crashes.
-#
-# For further details see e.g.
-# https://bugs.launchpad.net/ubuntu/+source/gcc-4.1/+bug/158799
-#
-# Szilard Pall (pszilard@cbr.su.se)
-#
-
-if(NOT GMX_DISABLE_GCC41_CHECK)
-
-if(CMAKE_COMPILER_IS_GNUCC)
- # if we have -dumpversion flag use that, otherwise try the --version
- execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
- RESULT_VARIABLE _gcc_dumpversion_res
- OUTPUT_VARIABLE _gcc_dumpversion_out
- OUTPUT_STRIP_TRAILING_WHITESPACE)
- # if gcc returned with error the -dumpversion is not available
- if(${_gcc_dumpversion_res} EQUAL 0)
- if(${_gcc_dumpversion_out} MATCHES ".*4\\.1\\.[0-9]+.*")
- message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x
- family (detected version: ${_gcc_dumpversion_out}). These compilers
- contain an optimization related bug which might results in code that
- exhibits incorrect behaviour and often leads to exploding systems or
- crashes. To disable this check set GMX_DISABLE_GCC41_CHECK=YES.")
- endif()
- else()
- message(WARNING " The GCC compiler in use does not support the -dumpversion flag.
- Will attempt parsing the version from the \"gcc --version\" output.")
- execute_process(COMMAND ${CMAKE_C_COMPILER} --version
- OUTPUT_VARIABLE _gcc_version_out
- OUTPUT_STRIP_TRAILING_WHITESPACE)
- if("${_gcc_version_out}" MATCHES ".*4\\.1\\.[0-9]+.*")
- message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x
- family. These compiler compilers contain an optimization related bug
- which might results in code that exhibits incorrect behaviour and
- often leads to exploding systems or crashes. To disable this check set
- GMX_DISABLE_GCC41_CHECK=YES.")
- endif()
- endif()
-endif()
-
-endif()
};
static const char *
-GPLText[] = {
+LicenseText[] = {
"This program is free software; you can redistribute it and/or",
- "modify it under the terms of the GNU General Public License",
- "as published by the Free Software Foundation; either version 2",
+ "modify it under the terms of the GNU Lesser General Public License",
+ "as published by the Free Software Foundation; either version 2.1",
"of the License, or (at your option) any later version."
};
* and 3 that no shifts could be applied. Negative numbers
* correspond to errors in the arguments provided.
*/
+GMX_LIBGMX_EXPORT
void
F77_FUNC(dsaupd,DSAUPD)(int * ido,
const char * bmat,
* \param lworkl Provide the same argument as you did to dsaupd()
* \param info Provide the same argument as you did to dsaupd()
*/
+GMX_LIBGMX_EXPORT
void
F77_FUNC(dseupd,DSEUPD)(int * rvec,
const char * howmny,
*/
#ifndef GMX_CPUID_H_
#define GMX_CPUID_H_
+
+#include <stdio.h>
+
#include "visibility.h"
+
#ifdef __cplusplus
extern "C" {
#endif
F77_FUNC(dsytrd,DSYTRD)(const char *uplo, int *n, double * a, int *lda, double *d,
double *e, double *tau, double *work, int *lwork, int *info);
+GMX_LIBGMX_EXPORT
void
F77_FUNC(dsyevr,DSYEVR)(const char *jobz, const char *range, const char *uplo, int *n,
double *a, int *lda, double *vl, double *vu, int *
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/* The macros in this file are intended to be used for writing
+ * architecture independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
/* Undefine all defines used below so we can include this file multiple times
* with different settings from the same source file.
*/
/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
-#undef GMX_X86_SIMD_WIDTH_HERE
+#undef GMX_SIMD_WIDTH_HERE
#undef gmx_epi32
*/
#if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
-"You should define GMX_MM128_HERE or GMX_MM256_HERE"
+#error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
#endif
#if defined GMX_MM128_HERE && defined GMX_MM256_HERE
-"You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
+#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
#endif
#ifdef GMX_MM128_HERE
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m128
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 2
+#define GMX_SIMD_WIDTH_HERE 2
#define gmx_mm_pr __m128d
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 8
+#define GMX_SIMD_WIDTH_HERE 8
#define gmx_mm_pr __m256
#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
+#define gmx_loaddh_pr gmx_mm256_load4_ps
+
+/* Half SIMD-width type */
+#define gmx_mm_hpr __m128
+
+/* Half SIMD-width macros */
+#define gmx_load_hpr _mm_load_ps
+#define gmx_load1_hpr(x) _mm_set1_ps((x)[0])
+#define gmx_store_hpr _mm_store_ps
+#define gmx_add_hpr _mm_add_ps
+#define gmx_sub_hpr _mm_sub_ps
+
+#define gmx_sum4_hpr gmx_mm256_sum4h_m128
+
+/* Conversion between half and full SIMD-width */
+#define gmx_2hpr_to_pr gmx_mm256_set_m128
+
#else
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m256d
}
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+ __m128 a;
+
+ a = _mm_load_ps(p);
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
+}
+
+
static __m256d
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
}
+static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
+{
+ __m256 sum;
+
+ sum = _mm256_add_ps(x,y);
+ return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1));
+}
static void
* The string name is used to print to the log file and in a fatal error
* if the val's don't match.
*/
-
+GMX_LIBGMX_EXPORT
void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
int nfile, const t_filenm fnm[], gmx_bool bParFn);
/* Splits the communication into nsim separate simulations
real cuberoot (real a);
GMX_LIBGMX_EXPORT
double gmx_erfd(double x);
+GMX_LIBGMX_EXPORT
double gmx_erfcd(double x);
GMX_LIBGMX_EXPORT
float gmx_erff(float x);
typedef struct gmx_ana_nbsearch_t gmx_ana_nbsearch_t;
/** Create a new neighborhood search data structure. */
+GMX_LIBGMX_EXPORT
int
gmx_ana_nbsearch_create(gmx_ana_nbsearch_t **d, real cutoff, int maxn);
/** Free memory allocated for neighborhood search. */
int
gmx_ana_nbsearch_init(gmx_ana_nbsearch_t *d, t_pbc *pbc, int n, rvec x[]);
/** Initializes neighborhood search for a frame using \c gmx_ana_pos_t. */
+GMX_LIBGMX_EXPORT
int
gmx_ana_nbsearch_pos_init(gmx_ana_nbsearch_t *d, t_pbc *pbc,
struct gmx_ana_pos_t *p);
real
gmx_ana_nbsearch_mindist(gmx_ana_nbsearch_t *d, rvec x);
/** Calculates the minimun distance from the reference points. */
+GMX_LIBGMX_EXPORT
real
gmx_ana_nbsearch_pos_mindist(gmx_ana_nbsearch_t *d,
struct gmx_ana_pos_t *p, int i);
void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms);
/* Calculate the sum over the simulations of an array of floats */
+GMX_LIBGMX_EXPORT
void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms);
/* Calculate the sum over the simulations of an array of doubles */
/* we put all of these on their own cache line by padding the data structure
to the size of a cache line on x86 (64 bytes): */
+#define TMPI_SIZEOF_X86_CACHE_LINE 64
typedef struct tMPI_Atomic
{
int value;
- char padding[64-sizeof(int)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
} tMPI_Atomic_t;
typedef struct tMPI_Atomic_ptr
{
void* value;
- char padding[64-sizeof(void*)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
} tMPI_Atomic_ptr_t;
typedef struct tMPI_Spinlock
{
unsigned int lock;
- char padding[64-sizeof(unsigned int)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
} tMPI_Spinlock_t;
as the 486, and gcc on some Linux versions still target 80386 by default).
We also specifically check for icc, because intrinsics are not always
- supported there. */
-#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
- !defined(__INTEL_COMPILER) )
+ supported there.
+
+ llvm has issues with inline assembly and also in 32 bits has support for
+ the gcc intrinsics */
+#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
+ !defined(__INTEL_COMPILER) ) || defined(__llvm__) )
#include "gcc_intrinsics.h"
#else
__asm__ __volatile__("lock ; xaddl %0, %1;"
:"=r"(i) :"m"(a->value), "0"(i) : "memory");
return i + __i;
-}
+}
static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
{
static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
{
- unsigned int prev;
+ int prev;
__asm__ __volatile__("lock ; cmpxchgl %1,%2"
: "=a"(prev)
int
gmx_ana_add_flags(gmx_ana_traj_t *d, unsigned long flags);
/** Sets the number of reference groups required. */
+GMX_LIBGMX_EXPORT
int
gmx_ana_set_nrefgrps(gmx_ana_traj_t *d, int nrefgrps);
/** Sets the number of analysis groups required. */
int
gmx_ana_get_nanagrps(gmx_ana_traj_t *d, int *nanagrps);
/** Gets the selection object for a reference selection. */
+GMX_LIBGMX_EXPORT
int
gmx_ana_get_refsel(gmx_ana_traj_t *d, int i, gmx_ana_selection_t **sel);
/** Gets the selection object for a reference selection. */
extern "C" {
#endif
-/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */
-enum { nbkNotSet = 0,
- nbk4x4_PlainC,
- nbk4xN_X86_SIMD128,
- nbk4xN_X86_SIMD256,
- nbk8x8x8_CUDA,
- nbk8x8x8_PlainC };
+#ifdef GMX_X86_SSE2
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+
+#ifdef GMX_X86_AVX_256
+/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+#define GMX_NBNXN_SIMD_BITWIDTH 256
+#else
+#define GMX_NBNXN_SIMD_BITWIDTH 128
+#endif
+
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense and are only implemented
+ * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ */
+#define GMX_NBNXN_SIMD_4XN
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#endif
+
+
+/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+typedef enum
+{
+ nbnxnkNotSet = 0,
+ nbnxnk4x4_PlainC,
+ nbnxnk4xN_SIMD_4xN,
+ nbnxnk4xN_SIMD_2xNN,
+ nbnxnk8x8x8_CUDA,
+ nbnxnk8x8x8_PlainC,
+ nbnxnkNR
+} nbnxn_kernel_type;
/* Note that _mm_... intrinsics can be converted to either SSE or AVX
* depending on compiler flags.
* For gcc we check for __AVX__
* At least a check for icc should be added (if there is a macro)
*/
-static const char *nbk_name[] =
- { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+static const char *nbnxn_kernel_name[nbnxnkNR] =
+ { "not set", "plain C",
+#if !(defined GMX_X86_SSE2)
+ "not available", "not available",
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
#ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
- "SSE2 4x4",
+ "SSE2", "SSE2",
#else
- "SSE2 4x2",
+ "SSE4.1", "SSE4.1",
#endif
#else
-#ifndef GMX_DOUBLE
- "SSE4.1 4x4",
-#else
- "SSE4.1 4x2",
+ "AVX-128", "AVX-128",
#endif
-#endif
-#else
-#ifndef GMX_DOUBLE
- "AVX-128 4x4",
#else
- "AVX-128 4x2",
+ "AVX-256", "AVX-256",
#endif
#endif
-#ifndef GMX_DOUBLE
- "AVX-256 4x8",
-#else
- "AVX-256 4x4",
-#endif
- "CUDA 8x8x8", "plain C 8x8x8" };
+ "CUDA", "plain C" };
enum { ewaldexclTable, ewaldexclAnalytical };
/* non-bonded data structure with Verlet-type cut-off */
typedef struct {
- nbnxn_search_t nbs; /* n vs n atom pair searching data */
- int ngrp; /* number of interaction groups */
- nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
+ nbnxn_search_t nbs; /* n vs n atom pair searching data */
+ int ngrp; /* number of interaction groups */
+ nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */
nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */
unsigned excl; /* The exclusion (interaction) bits */
} nbnxn_cj_t;
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc)) => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc) => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
#define NBNXN_CI_SHIFT 127
#define NBNXN_CI_DO_LJ(subc) (1<<(7+3*(subc)))
#define NBNXN_CI_HALF_LJ(subc) (1<<(8+3*(subc)))
/* Simple pair-list i-unit */
typedef struct {
int ci; /* i-cluster */
- int shift; /* Shift vector index plus possible flags */
+ int shift; /* Shift vector index plus possible flags, see above */
int cj_ind_start; /* Start index into cj */
int cj_ind_end; /* End index into cj */
} nbnxn_ci_t;
#define INVSQRT_DONE
#endif /* gmx_invsqrt */
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
- const real half=0.5;
- const real three=3.0;
- t_convert result,bit_pattern;
- unsigned int exp,fract;
- real lu;
- real y;
-#ifdef GMX_DOUBLE
- real y2;
-#endif
-
- lu = __frsqrte((double)x);
-
- y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
- /* Extra iteration required */
- y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
- y2=(half*y*(three-((x*y)*y)));
-
- return y2; /* 10 Flops */
-#else
- return y; /* 5 Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
#ifndef INVSQRT_DONE
# ifdef GMX_DOUBLE
# ifdef HAVE_RSQRT
# If you only use one shell you can copy that GMXRC.* instead.
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
# if we got here, shell is bsh/bash/zsh/ksh
# bsh cannot remove part of a variable with %%
RENAME CMakeLists.txt
COMPONENT development)
+file(GLOB_RECURSE GROMACS_HEADERS ${CMAKE_SOURCE_DIR}/include *.h)
+add_custom_command(OUTPUT gromacs
+ COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/include gromacs
+ DEPENDS ${GROMACS_HEADERS})
+add_custom_target(gromacs_include_links DEPENDS gromacs)
+
+option(GMX_BUILD_TEMPLATE "Build gromacs template program" ON)
+mark_as_advanced(GMX_BUILD_TEMPLATE)
+# GMX_PREFER_STATIC_OPENMP=yes is a special case to build binaries
+# to distribute and as the template is not installed it can be
+# ignored.
+# The template is build in a user-like environment, hence we use
+# flags from PKG_CFLAGS. Again GMX_PREFER_STATIC_OPENMP=yes would
+# need special link flags (OpenMP_LINKER_FLAGS), which are not
+# very user-like.
+if (GMX_BUILD_TEMPLATE AND NOT GMX_PREFER_STATIC_OPENMP)
+ add_executable(template template.c)
+ remove_definitions( -DHAVE_CONFIG_H )
+ add_definitions("${PKG_CFLAGS}")
+ target_link_libraries(template gmx)
+ include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+ add_dependencies(template gromacs_include_links)
+endif()
+
install(FILES README template.c Makefile.pkg
DESTINATION ${DATA_INSTALL_DIR}/template
COMPONENT development)
#include <gromacs/pbc.h>
#include <gromacs/smalloc.h>
#include <gromacs/statutil.h>
-#include <gromacs/vec.h>
#include <gromacs/xvgr.h>
+#include <gromacs/gmx_fatal.h>
#include <gromacs/nbsearch.h>
#include <gromacs/trajana.h>
/* Turn off all water neighborlist optimization - not used right now */
#cmakedefine DISABLE_WATER_NLIST
-/* Fortran support */
-#cmakedefine GMX_FORTRAN
-
-/* Define to a macro mangling the given C identifier (in lower and upper
- case), which must not contain underscores, for linking with Fortran. */
-#define F77_FUNC(name,NAME) @F77_FUNCDEF@
-
-/* As F77_FUNC, but for C identifiers containing underscores. */
-#define F77_FUNC_(name,NAME) @F77_FUNCDEF_@
-
/* IEEE754 floating-point format. Memory layout is defined by macros
* GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER and GMX_IEEE754_BIG_ENDIAN_WORD_ORDER.
*/
/* Use assembly intrinsics kernels for BlueGene */
#cmakedefine GMX_BLUEGENE
-/* Power6 acceleration */
-#cmakedefine GMX_POWER6
-
/* Work around broken calloc() */
#cmakedefine GMX_BROKEN_CALLOC
/* Use the GROMACS software 1/sqrt(x) */
#cmakedefine GMX_SOFTWARE_INVSQRT
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
/* Use sub-counters */
#cmakedefine GMX_CYCLE_SUBCOUNTERS
/* Build special-purpose mdrun library */
#cmakedefine GMX_FAHCORE
+/* Disable gromacs quotes */
+#cmakedefine GMX_NO_QUOTES
+
#ifdef GMX_FAHCORE
#define FULLINDIRECT 1
#define USE_FAH_XDR 1
* but we dont call this routine often, and it avoids using
* a mutex for locking the variable...
*/
-#ifdef GMX_FAHCORE
+#if defined(GMX_FAHCORE) || defined(GMX_NO_QUOTES)
/*be uncool*/
return FALSE;
#else
* name of a file. Otherwise, we won't be able to find the library dir.
*/
#define NCR (int)asize(CopyrightText)
+/* TODO: Is this exception still needed? */
#ifdef GMX_FAHCORE
-#define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
+#define NLICENSE 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
#else
-#define NGPL (int)asize(GPLText)
+#define NLICENSE (int)asize(LicenseText)
#endif
char buf[256],tmpstr[1024];
for(i=0; (i<NCR); i++)
sp_print(out,CopyrightText[i]);
- for(i=0; (i<NGPL); i++)
- sp_print(out,GPLText[i]);
+ for(i=0; (i<NLICENSE); i++)
+ sp_print(out,LicenseText[i]);
fprintf(out,"\n");
#include <unistd.h>
#endif
+#include "gmx_cpuid.h"
-#include "gmx_cpuid.h"
-
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+# define GMX_CPUID_X86
+#endif
/* Global constant character strings corresponding to our enumerated types */
const char *
#endif
-/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
- * if the compiler handles GNU-style inline assembly.
- */
-#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#ifdef GMX_CPUID_X86
/* Execute CPUID on x86 class CPUs. level sets function to exec, and the
* contents of register output is returned. See Intel/AMD docs for details.
{
int rc = 0;
+ /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
+ * if the compiler handles GNU-style inline assembly.
+ */
+
#if (defined _MSC_VER)
int CPUInfo[4];
#endif
return rc;
}
-#endif /* architecture is x86 */
/* Identify CPU features common to Intel & AMD - mainly brand string,
}
return 0;
}
+#endif /* GMX_CPUID_X86 */
+
+
/* Try to find the vendor of the current CPU, so we know what specific
* detection routine to call.
/* Set default first */
vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#ifdef GMX_CPUID_X86
execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
memcpy(vendorstring,&ebx,4);
vendor = i;
}
}
-
+#else
+ vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#endif
+
return vendor;
}
switch(cpuid->vendor)
{
+#ifdef GMX_CPUID_X86
case GMX_CPUID_VENDOR_INTEL:
cpuid_check_intel_x86(cpuid);
break;
case GMX_CPUID_VENDOR_AMD:
cpuid_check_amd_x86(cpuid);
break;
+#endif
default:
/* Could not find vendor */
strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
enum gmx_cpuid_x86_smt
gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
{
-
+#ifdef GMX_CPUID_X86
#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
int i;
int nproc;
return GMX_CPUID_X86_SMT_CANNOTDETECT;
}
#endif
+#else
+ /* not x86 */
+ return GMX_CPUID_X86_SMT_CANNOTDETECT;
+#endif
}
* ridiculous number. */
static unsigned int max_gpu_ids_user = 64;
+static const char* invalid_gpuid_hint =
+ "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+
/* FW decl. */
void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
{
if (idstr[i] < '0' || idstr[i] > '9')
{
- gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
+ gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+ idstr[i], invalid_gpuid_hint);
}
idlist[i] = idstr[i] - '0';
}
bGPUBin = FALSE;
#endif
- /* Bail if binary is not compiled with GPU on */
+ /* Bail if binary is not compiled with GPU acceleration, but this is either
+ * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
if (bForceUseGPU && !bGPUBin)
{
- gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ }
+ if (gpu_id != NULL && !bGPUBin)
+ {
+ gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
}
/* run the detection if the binary was compiled with GPU support */
if (nid == 0)
{
- gmx_fatal(FARGS, "Empty GPU ID string passed\n");
+ gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
}
res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
OPTIONS ${_os_def}
RELWITHDEBINFO -g
DEBUG -g -D_DEBUG_=1 )
+#Because this is a static library linked into the (potential) shared library
+#it should have the export of the shared library.
+SET_TARGET_PROPERTIES(gpu_utils PROPERTIES DEFINE_SYMBOL "gmx_EXPORTS" )
CUDA_BUILD_CLEAN_TARGET()
URL: http://www.gromacs.org
Version: @PROJECT_VERSION@
Requires:
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
Libs: -L${libdir} -lgmx@GMX_LIBS_SUFFIX@ -lm
Cflags: -I${includedir} @PKG_CFLAGS@
t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
- return gmx_mm256_set_m128(t2,t1);
+ return gmx_mm256_set_m128d(t2,t1);
}
{
__m256d t1,t2;
- t1 = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
- t2 = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */
+ t1 = gmx_mm256_set_m128d(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
+ t2 = gmx_mm256_set_m128d(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */
*c6 = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
*c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
ty = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
}
tx = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
- *x2 = gmx_mm256_set_m128(tx,tx);
- *y2 = gmx_mm256_set_m128(ty,ty);
- *z2 = gmx_mm256_set_m128(tz,tz);
+ *x2 = gmx_mm256_set_m128d(tx,tx);
+ *y2 = gmx_mm256_set_m128d(ty,ty);
+ *z2 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
- *x3 = gmx_mm256_set_m128(tx,tx);
- *y3 = gmx_mm256_set_m128(ty,ty);
- *z3 = gmx_mm256_set_m128(tz,tz);
+ *x3 = gmx_mm256_set_m128d(tx,tx);
+ *y3 = gmx_mm256_set_m128d(ty,ty);
+ *z3 = gmx_mm256_set_m128d(tz,tz);
}
tx = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
- *x2 = gmx_mm256_set_m128(tx,tx);
- *y2 = gmx_mm256_set_m128(ty,ty);
- *z2 = gmx_mm256_set_m128(tz,tz);
+ *x2 = gmx_mm256_set_m128d(tx,tx);
+ *y2 = gmx_mm256_set_m128d(ty,ty);
+ *z2 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
- *x3 = gmx_mm256_set_m128(tx,tx);
- *y3 = gmx_mm256_set_m128(ty,ty);
- *z3 = gmx_mm256_set_m128(tz,tz);
+ *x3 = gmx_mm256_set_m128d(tx,tx);
+ *y3 = gmx_mm256_set_m128d(ty,ty);
+ *z3 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
- *x4 = gmx_mm256_set_m128(tx,tx);
- *y4 = gmx_mm256_set_m128(ty,ty);
- *z4 = gmx_mm256_set_m128(tz,tz);
+ *x4 = gmx_mm256_set_m128d(tx,tx);
+ *y4 = gmx_mm256_set_m128d(ty,ty);
+ *z4 = gmx_mm256_set_m128d(tz,tz);
}
tA = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
tB = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
- fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */
+ fix1 = gmx_mm256_set_m128d(tB,tA); /* 0 fiz fiy fix */
t1 = _mm256_loadu_pd(fptr);
t2 = _mm256_loadu_pd(fshiftptr);
tB = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
tC = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
- t1 = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
+ t1 = gmx_mm256_set_m128d(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
t2 = _mm256_loadu_pd(fptr);
tD = _mm_loadu_pd(fptr+4);
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
/* We tried again, and this time there was a copied buffer.
We use that, and indicate that we're not reading from the
regular buf. This case should be pretty rare. */
- tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
+ tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount),-1);
tMPI_Atomic_memory_barrier_acq();
srcbuf=try_again_srcbuf;
}
{
/* we decrement the read count; potentially releasing the buffer. */
tMPI_Atomic_memory_barrier_rel();
- tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
+ tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
}
#endif
}
else
{
/* wait until everybody else is done copying the original buffer.
- We use fetch_add because we want to be sure of coherency.
+ We use atomic add-return because we want to be sure of coherency.
This wait is bound to be very short (otherwise it wouldn't
be double-buffering) so we always spin here. */
/*tMPI_Atomic_memory_barrier_rel();*/
-100000))
#endif
#if 0
- while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0)
+ while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0)
!= 0)
#endif
#if 1
tMPI_Atomic_memory_barrier_rel();
/* signal that we're done */
- tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
+ tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
/* we need to keep being in sync */
csync->syncs++;
}
tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
}
}
- /* the main thread now also runs start_fn if we don't want
+ /* the main thread also runs start_fn if we don't want
it to return */
if (!main_returns)
tMPI_Thread_starter((void*)&(threads[0]));
tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
#endif
-
if (TMPI_COMM_WORLD==0) /* we're the main process */
{
int N=0;
tMPI_Get_N(argc, argv, "-nt", &N);
- tMPI_Start_threads(FALSE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
+ tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
NULL, NULL, start_function);
}
else
list(APPEND GMX_EXTRA_LIBRARIES gmxpreprocess md ${OpenMP_LINKER_FLAGS})
set(GMX_KERNEL_PROGRAMS
- grompp tpbconv pdb2gmx g_protonate g_luck gmxdump g_x2top gmxcheck)
+ grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck)
+if (NOT GMX_NO_QUOTES)
+ set(GMX_KERNEL_PROGRAMS ${GMX_KERNEL_PROGRAMS} g_luck)
+endif (NOT GMX_NO_QUOTES)
+
foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
add_executable(${PROGRAM} ${PROGRAM}.c main.c)
}
else
{
-#ifndef GMX_X86_SSE2
+#ifndef GMX_NBNXN_SIMD
list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
#else
- int simd_width;
-
-#ifdef GMX_X86_AVX_256
- simd_width = 256;
-#else
- simd_width = 128;
+ list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We assume the smallest cluster size to be on the safe side */
+ list_setup->cluster_size_j /= 2;
#endif
- list_setup->cluster_size_j = simd_width/(sizeof(real)*8);
#endif
}
}
&(ir->nkx),&(ir->nky),&(ir->nkz));
}
+ /* MRS: eventually figure out better logic for initializing the fep
+ values that makes declaring the lambda and declaring the state not
+ potentially conflict if not handled correctly. */
+ if (ir->efep != efepNO)
+ {
+ state.fep_state = ir->fepvals->init_fep_state;
+ for (i=0;i<efptNR;i++)
+ {
+ /* init_lambda trumps state definitions*/
+ if (ir->fepvals->init_lambda >= 0)
+ {
+ state.lambda[i] = ir->fepvals->init_lambda;
+ }
+ else
+ {
+ if (ir->fepvals->all_lambda[i] == NULL)
+ {
+ gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
+ }
+ else
+ {
+ state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
+ }
+ }
+ }
+ }
+
if (ir->ePull != epullNO)
- set_pull_init(ir,sys,state.x,state.box,oenv,opts->pull_start);
+ set_pull_init(ir,sys,state.x,state.box,state.lambda[efptMASS],oenv,opts->pull_start);
if (ir->bRot)
{
}
}
- /* MRS: eventually figure out better logic for initializing the fep
- values that makes declaring the lambda and declaring the state not
- potentially conflict if not handled correctly. */
- if (ir->efep != efepNO)
- {
- state.fep_state = ir->fepvals->init_fep_state;
- for (i=0;i<efptNR;i++)
- {
- /* init_lambda trumps state definitions*/
- if (ir->fepvals->init_lambda >= 0)
- {
- state.lambda[i] = ir->fepvals->init_lambda;
- }
- else
- {
- if (ir->fepvals->all_lambda[i] == NULL)
- {
- gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
- }
- else
- {
- state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
- }
- }
- }
- }
-
if (bVerbose)
fprintf(stderr,"writing run input file...\n");
{
nstfep = ir->expandedvals->nstexpanded;
}
- if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+ if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
{
nstfep = repl_ex_nst;
}
{
if (bTrotter)
{
+ m_add(force_vir,shake_vir,total_vir); /* we need the un-dispersion corrected total vir here */
trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
}
else
/* at the start of step, randomize the velocities */
if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
{
- gmx_bool bDoAndersenConstr;
- bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
+ gmx_bool bDoAndersenConstr,bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
+ bDoAndersenConstr = (constr && bIfRandomize);
/* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
if (bDoAndersenConstr)
{
state->fep_state = lamnew;
for (i=0;i<efptNR;i++)
{
- state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
}
}
/* Remaining runtime */
{
nn = rr[i].main;
}
+
if (nn[0] == '-')
{
- gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+ gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
}
}
ic->ewaldcoeff = set->ewaldcoeff;
bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
- if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+ if (pme_lb->cutoff_scheme == ecutsVERLET &&
+ nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
}
/* Process the pull parameters after reading the index groups */
GMX_LIBGMXPREPROCESS_EXPORT
-extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
+extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box, real lambda,
const output_env_t oenv, gmx_bool bStart);
/* Prints the initial pull group distances in x.
* If bStart adds the distance to the initial reference location.
}
}
-void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
+void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,real lambda,
const output_env_t oenv,gmx_bool bStart)
{
t_mdatoms *md;
t_pbc pbc;
int ndim,g,m;
double t_start,tinvrate;
- real lambda=0;
rvec init;
dvec dr,dev;
- /* need to pass in the correct masses if free energy is on*/
- if (ir->efep)
- {
- lambda = ir->fepvals->all_lambda[efptMASS][ir->fepvals->init_fep_state];
- }
init_pull(NULL,ir,0,NULL,mtop,NULL,oenv,lambda,FALSE,0);
md = init_mdatoms(NULL,mtop,ir->efep);
atoms2md(mtop,ir,0,NULL,0,mtop->natoms,md);
if (ir->efep)
- update_mdatoms(md,ir->fepvals->init_lambda);
-
+ {
+ update_mdatoms(md,lambda);
+ }
pull = ir->pull;
if (pull->eGeom == epullgPOS)
ndim = 3;
# disabling GMX_BUILD_OWN_FFTW changes dependencies correctly.
add_dependencies(md gmxfftw)
endif()
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+option(GMX_PREFIX_LIBMD "Change install name of libmd to libgmxmd to avoid collision with BSD's/Martin Hinner's libmd, which is used in X11 and zfs" OFF)
+mark_as_advanced(GMX_PREFIX_LIBMD)
+if (GMX_PREFIX_LIBMD)
+ set(MD_PREFIX "gmx")
+else()
+ set(MD_PREFIX)
+endif()
+set_target_properties(md PROPERTIES OUTPUT_NAME "${MD_PREFIX}md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
COMPILE_FLAGS "${OpenMP_C_FLAGS}")
install(TARGETS md DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libmd.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc
DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
- RENAME "libmd${GMX_LIBS_SUFFIX}.pc"
+ RENAME "lib${MD_PREFIX}md${GMX_LIBS_SUFFIX}.pc"
COMPONENT development)
int b,i,j;
real mvb,im1,im2,tmp0,tmp1,tmp2;
- for(b=0; b<ncons; b++)
+ if (invmass != NULL)
{
- i = bla[2*b];
- j = bla[2*b+1];
- mvb = prefac*fac[b];
- im1 = invmass[i];
- im2 = invmass[j];
- tmp0 = r[b][0]*mvb;
- tmp1 = r[b][1]*mvb;
- tmp2 = r[b][2]*mvb;
- x[i][0] -= tmp0*im1;
- x[i][1] -= tmp1*im1;
- x[i][2] -= tmp2*im1;
- x[j][0] += tmp0*im2;
- x[j][1] += tmp1*im2;
- x[j][2] += tmp2*im2;
- } /* 16 ncons flops */
+ for(b=0; b<ncons; b++)
+ {
+ i = bla[2*b];
+ j = bla[2*b+1];
+ mvb = prefac*fac[b];
+ im1 = invmass[i];
+ im2 = invmass[j];
+ tmp0 = r[b][0]*mvb;
+ tmp1 = r[b][1]*mvb;
+ tmp2 = r[b][2]*mvb;
+ x[i][0] -= tmp0*im1;
+ x[i][1] -= tmp1*im1;
+ x[i][2] -= tmp2*im1;
+ x[j][0] += tmp0*im2;
+ x[j][1] += tmp1*im2;
+ x[j][2] += tmp2*im2;
+ } /* 16 ncons flops */
+ }
+ else
+ {
+ for(b=0; b<ncons; b++)
+ {
+ i = bla[2*b];
+ j = bla[2*b+1];
+ mvb = prefac*fac[b];
+ tmp0 = r[b][0]*mvb;
+ tmp1 = r[b][1]*mvb;
+ tmp2 = r[b][2]*mvb;
+ x[i][0] -= tmp0;
+ x[i][1] -= tmp1;
+ x[i][2] -= tmp2;
+ x[j][0] += tmp0;
+ x[j][1] += tmp1;
+ x[j][2] += tmp2;
+ }
+ }
}
static void lincs_update_atoms_ind(int ncons,const int *ind,const int *bla,
int bi,b,i,j;
real mvb,im1,im2,tmp0,tmp1,tmp2;
- for(bi=0; bi<ncons; bi++)
+ if (invmass != NULL)
{
- b = ind[bi];
- i = bla[2*b];
- j = bla[2*b+1];
- mvb = prefac*fac[b];
- im1 = invmass[i];
- im2 = invmass[j];
- tmp0 = r[b][0]*mvb;
- tmp1 = r[b][1]*mvb;
- tmp2 = r[b][2]*mvb;
- x[i][0] -= tmp0*im1;
- x[i][1] -= tmp1*im1;
- x[i][2] -= tmp2*im1;
- x[j][0] += tmp0*im2;
- x[j][1] += tmp1*im2;
- x[j][2] += tmp2*im2;
- } /* 16 ncons flops */
+ for(bi=0; bi<ncons; bi++)
+ {
+ b = ind[bi];
+ i = bla[2*b];
+ j = bla[2*b+1];
+ mvb = prefac*fac[b];
+ im1 = invmass[i];
+ im2 = invmass[j];
+ tmp0 = r[b][0]*mvb;
+ tmp1 = r[b][1]*mvb;
+ tmp2 = r[b][2]*mvb;
+ x[i][0] -= tmp0*im1;
+ x[i][1] -= tmp1*im1;
+ x[i][2] -= tmp2*im1;
+ x[j][0] += tmp0*im2;
+ x[j][1] += tmp1*im2;
+ x[j][2] += tmp2*im2;
+ } /* 16 ncons flops */
+ }
+ else
+ {
+ for(bi=0; bi<ncons; bi++)
+ {
+ b = ind[bi];
+ i = bla[2*b];
+ j = bla[2*b+1];
+ mvb = prefac*fac[b];
+ tmp0 = r[b][0]*mvb;
+ tmp1 = r[b][1]*mvb;
+ tmp2 = r[b][2]*mvb;
+ x[i][0] -= tmp0;
+ x[i][1] -= tmp1;
+ x[i][2] -= tmp2;
+ x[j][0] += tmp0;
+ x[j][1] += tmp1;
+ x[j][2] += tmp2;
+ } /* 16 ncons flops */
+ }
}
static void lincs_update_atoms(struct gmx_lincsdata *li,int th,
}
}
- if (econq != econqForce)
+ /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
+ for(b=b0; b<b1; b++)
{
- lincs_update_atoms(lincsd,th,1.0,sol,r,invmass,fp);
+ sol[b] *= blc[b];
}
- else
+
+ /* When constraining forces, we should not use mass weighting,
+ * so we pass invmass=NULL, which results in the use of 1 for all atoms.
+ */
+ lincs_update_atoms(lincsd,th,1.0,sol,r,
+ (econq != econqForce) ? invmass : NULL,fp);
+
+ if (dvdlambda != NULL)
{
+#pragma omp barrier
for(b=b0; b<b1; b++)
{
- i = bla[2*b];
- j = bla[2*b+1];
- mvb = blc[b]*sol[b];
- tmp0 = r[b][0]*mvb;
- tmp1 = r[b][1]*mvb;
- tmp2 = r[b][2]*mvb;
- fp[i][0] -= tmp0;
- fp[i][1] -= tmp1;
- fp[i][2] -= tmp2;
- fp[j][0] += tmp0;
- fp[j][1] += tmp1;
- fp[j][2] += tmp2;
- }
-
- if (dvdlambda != NULL)
- {
-#pragma omp barrier
- for(b=b0; b<b1; b++)
- {
- *dvdlambda -= blc[b]*sol[b]*lincsd->ddist[b];
- }
+ *dvdlambda -= sol[b]*lincsd->ddist[b];
}
/* 10 ncons flops */
}
*/
for(b=b0; b<b1; b++)
{
- mvb = lincsd->bllen[b]*blc[b]*sol[b];
+ mvb = lincsd->bllen[b]*sol[b];
for(i=0; i<DIM; i++)
{
tmp1 = mvb*r[b][i];
}
static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box,
- gmx_ekindata_t *ekind, tensor vir, real pcorr, real ecorr, t_extmass *MassQ)
+ gmx_ekindata_t *ekind, tensor vir, real pcorr, t_extmass *MassQ)
{
real pscal;
/* for now, we use Elr = 0, because if you want to get it right, you
really should be using PME. Maybe print a warning? */
- pscal = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres);
+ pscal = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres)+pcorr;
vol = det(box);
GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p)); /* W is in ps^2 * bar * nm^3 */
case etrtBAROV:
case etrtBAROV2:
boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
- enerd->term[F_PDISPCORR],enerd->term[F_DISPCORR],MassQ);
+ enerd->term[F_PDISPCORR],MassQ);
break;
case etrtBARONHC:
case etrtBARONHC2:
#include "gmxcomplex.h"
#include "gmx_fft.h"
-#ifndef GMX_LIB_MPI
+#ifndef GMX_MPI
double MPI_Wtime();
#endif
nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
nbl->table_elec.ninteractions = 1;
nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
- snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
+ snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),32);
nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
nbl->table_vdw.format = nbl->table_elec_vdw.format;
nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
nbl->table_vdw.ninteractions = 2;
nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
- snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
+ snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),32);
for(i=0; i<=nbl->table_elec_vdw.n; i++)
{
static void pick_nbnxn_kernel_cpu(FILE *fp,
const t_commrec *cr,
const gmx_cpuid_t cpuid_info,
+ const t_inputrec *ir,
int *kernel_type,
int *ewald_excl)
{
- *kernel_type = nbk4x4_PlainC;
+ *kernel_type = nbnxnk4x4_PlainC;
*ewald_excl = ewaldexclTable;
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD
{
- /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
- * On AMD Bulldozer AVX-256 is much slower than AVX-128.
- */
- if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
- gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
- {
-#ifdef GMX_X86_AVX_256
- *kernel_type = nbk4xN_X86_SIMD256;
-#else
- *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
#endif
- }
- else
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We expect the 2xNN kernels to be faster in most cases */
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
{
- *kernel_type = nbk4xN_X86_SIMD128;
+ /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT, but extra zeros interactions
+ * can compensate. As we currently don't detect the actual use
+ * of HT, switch to 4x8 to avoid a potential performance hit.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
}
-
- if (getenv("GMX_NBNXN_AVX128") != NULL)
+#endif
+ if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
- *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+ gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
}
- if (getenv("GMX_NBNXN_AVX256") != NULL)
+ if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
{
-#ifdef GMX_X86_AVX_256
- *kernel_type = nbk4xN_X86_SIMD256;
+#ifdef GMX_NBNXN_SIMD_2XNN
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
#else
- gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
+ gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
#endif
}
const gmx_hw_info_t *hwinfo,
gmx_bool use_cpu_acceleration,
gmx_bool *bUseGPU,
+ const t_inputrec *ir,
int *kernel_type,
int *ewald_excl,
gmx_bool bDoNonbonded)
assert(kernel_type);
- *kernel_type = nbkNotSet;
+ *kernel_type = nbnxnkNotSet;
*ewald_excl = ewaldexclTable;
bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
if (bEmulateGPU)
{
- *kernel_type = nbk8x8x8_PlainC;
+ *kernel_type = nbnxnk8x8x8_PlainC;
if (bDoNonbonded)
{
}
else if (bGPU)
{
- *kernel_type = nbk8x8x8_CUDA;
+ *kernel_type = nbnxnk8x8x8_CUDA;
}
- if (*kernel_type == nbkNotSet)
+ if (*kernel_type == nbnxnkNotSet)
{
if (use_cpu_acceleration)
{
- pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
+ pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
kernel_type,ewald_excl);
}
else
{
- *kernel_type = nbk4x4_PlainC;
+ *kernel_type = nbnxnk4x4_PlainC;
}
}
if (bDoNonbonded && fp != NULL)
{
- if (MASTER(cr))
- {
- fprintf(stderr,"Using %s non-bonded kernels\n",
- nbk_name[*kernel_type]);
- }
- fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
- nbk_name[*kernel_type]);
+ fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
+ nbnxn_kernel_name[*kernel_type],
+ nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ nbnxn_kernel_to_cj_size(*kernel_type));
}
}
sfree_aligned(ic->tabq_coul_V);
/* Create the original table data in FDV0 */
- snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
- snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
- snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
+ snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,32);
+ snew_aligned(ic->tabq_coul_F,ic->tabq_size,32);
+ snew_aligned(ic->tabq_coul_V,ic->tabq_size,32);
table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
}
snew(ic, 1);
/* Just allocate something so we can free it */
- snew_aligned(ic->tabq_coul_FDV0,16,16);
- snew_aligned(ic->tabq_coul_F,16,16);
- snew_aligned(ic->tabq_coul_V,16,16);
+ snew_aligned(ic->tabq_coul_FDV0,16,32);
+ snew_aligned(ic->tabq_coul_F,16,32);
+ snew_aligned(ic->tabq_coul_V,16,32);
ic->rlist = fr->rlist;
ic->rlistlong = fr->rlistlong;
{
nbv->grp[i].nbl_lists.nnbl = 0;
nbv->grp[i].nbat = NULL;
- nbv->grp[i].kernel_type = nbkNotSet;
+ nbv->grp[i].kernel_type = nbnxnkNotSet;
if (i == 0) /* local */
{
pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
&nbv->bUseGPU,
+ ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
fr->bNonbonded);
/* Use GPU for local, select a CPU kernel for non-local */
pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
NULL,
+ ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
fr->bNonbonded);
for(i=0; i<nbv->ngrp; i++)
{
- if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+ if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nb_alloc = &pmalloc;
nb_free = &pfree;
snew(state->cg_p,state->nalloc);
}
}
- if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE) {
+ if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) {
state->nrng = gmx_rng_n();
state->nrngi = 1;
- if (EI_SD(ir->eI) || ir->eI == eiBD) {
+ if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc)) {
/* This will be correct later with DD */
state->nrng *= nnodes;
state->nrngi *= nnodes;
{
bcast_ir_mtop(cr,inputrec,mtop);
- if (inputrec->eI == eiBD || EI_SD(inputrec->eI)) {
+ if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc)) {
/* Make sure the random seeds are different on each node */
inputrec->ld_seed += cr->nodeid;
}
Version: @PROJECT_VERSION@
Requires: libgmx@GMX_LIBS_SUFFIX@ @PKG_FFT@
Libs.private: -lm @CMAKE_THREAD_LIBS_INIT@
-Libs: -L${libdir} -lmd@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@
+Libs: -L${libdir} -l@MD_PREFIX@md@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@
Cflags: -I${includedir} @PKG_CFLAGS@
ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
ma((void **)&out->Vc ,out->nV*sizeof(*out->Vc ));
- if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
- nb_kernel_type == nbk4xN_X86_SIMD256)
+ if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+ nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
{
cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
nbat->lj_comb = NULL;
if (simple)
{
+ int pack_x;
+
switch (nb_kernel_type)
{
- case nbk4xN_X86_SIMD128:
- nbat->XFormat = nbatX4;
- break;
- case nbk4xN_X86_SIMD256:
-#ifndef GMX_DOUBLE
- nbat->XFormat = nbatX8;
-#else
- nbat->XFormat = nbatX4;
-#endif
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+ nbnxn_kernel_to_cj_size(nb_kernel_type));
+ switch (pack_x)
+ {
+ case 4:
+ nbat->XFormat = nbatX4;
+ break;
+ case 8:
+ nbat->XFormat = nbatX8;
+ break;
+ default:
+ gmx_incons("Unsupported packing width");
+ }
break;
default:
nbat->XFormat = nbatXYZ;
#else
#define GMX_MM128_HERE
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
int i,s;
gmx_mm_pr dest_SSE,src_SSE;
if (bDestSet)
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(dest+i);
for(s=0; s<nsrc; s++)
}
else
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(src[0]+i);
for(s=1; s<nsrc; s++)
OPTIONS
RELWITHDEBINFO -g
DEBUG -g -D_DEBUG_=1)
+ #Because this is a static library linked into the (potential) shared library
+ #it should have the export of the shared library.
+ SET_TARGET_PROPERTIES(nbnxn_cuda PROPERTIES DEFINE_SYMBOL "md_EXPORTS" )
endif()
int nsubc_tot; /* Total number of subcell, used for printing */
} nbnxn_grid_t;
-#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd128 {
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+typedef struct nbnxn_x_ci_simd_4xn {
/* The i-cluster coordinates for simple search */
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd128_t;
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd256 {
+} nbnxn_x_ci_simd_4xn_t;
+
+typedef struct nbnxn_x_ci_simd_2xnn {
/* The i-cluster coordinates for simple search */
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
- gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
- gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd256_t;
-#undef GMX_MM256_HERE
-#endif
+} nbnxn_x_ci_simd_2xnn_t;
+
#endif
/* Working data for the actual i-supercell during pair search */
float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
real *x_ci; /* The coordinates, pbc shifted, for each atom */
-#ifdef NBNXN_SEARCH_SSE
- nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
-#ifdef GMX_X86_AVX_256
- nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
+ nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
#endif
int cj_ind; /* The current cj_ind index for the current list */
int cj4_init; /* The first unitialized cj4 block */
nbnxn_list_work_t *work);
static gmx_icell_set_x_t icell_set_x_simple;
-#ifdef NBNXN_SEARCH_SSE
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
-#ifdef GMX_X86_AVX_256
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
#endif
static gmx_icell_set_x_t icell_set_x_supersub;
#ifdef NBNXN_SEARCH_SSE
static gmx_icell_set_x_t icell_set_x_supersub_sse8;
#endif
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+
/* Local cycle count struct for profiling */
typedef struct {
int count;
real *nbfp_i;
int n,ci,ci_sh;
int ish,ishf;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int cjind0,cjind1,cjind;
int ip,jp;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef CALC_ENERGIES
#ifndef ENERGY_GROUPS
}
}
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
#ifdef CALC_ENERGIES
real Vc_sub_self;
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD_2XNN
-#include "nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernel_simd_2xnn.h"
-/* Include all flavors of the 128-bit SSE or AVX kernel loops */
+/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef CALC_COUL_RF
#define CALC_COUL_TAB
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_TAB
#define CALC_COUL_EWALD
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_EWALD
enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener
static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp
static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener
static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
int ng_p2,i,j,j0,j1,c,s;
-#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
-
ng_p2 = (1<<ng_2log);
/* The size of the x86 SIMD energy group buffer array is:
- * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+ * ng*ng*ng_p2*unrollj_half*simd_width
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_HALF; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
+ for(s=0; s<unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += SIMD_WIDTH + 2;
+ c += simd_width/2 + 2;
}
}
}
}
}
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD_2XNN */
void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw)
-#ifdef GMX_X86_SSE2
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw)
+#ifdef GMX_NBNXN_SIMD_2XNN
{
int nnbl;
nbnxn_pairlist_t **nbl;
}
#else
{
- gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
+ gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
}
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef _nbnxn_kernel_x86_simd128_h
-#define _nbnxn_kernel_x86_simd128_h
+#ifndef _nbnxn_kernel_simd_2xnn_h
+#define _nbnxn_kernel_simd_2xnn_h
#include "typedefs.h"
extern "C" {
#endif
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width 2*N
+ * by packing 2*N j-atom variables in SIMD registers.
+ */
void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw);
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw);
#ifdef __cplusplus
}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This files includes all x86 SIMD kernel flavors.
+ * Only the Electrostatics type and optionally the VdW cut-off check
+ * need to be set before including this file.
+ */
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel duplicates the data for N j-particles in
+ * 2xN wide SIMD registers to do operate on 2 i-particles at once.
+ * This leads to 4/2=2 sets of most instructions. Therefore we call
+ * this kernel 2x(N+N) = 2xnn
+ *
+ * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ * and instructions removed.
+ *
+ * An alternative would be to load to different cluster of N j-particles
+ * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ * of instructions, which could lead to better scheduling. But we actually
+ * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ * It could be worth trying this option, but it takes some more effort.
+ * This 2xnn kernel is basically the 4xn equivalent with
+ */
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or virial is required we calculate them
+ * separately to as then it is easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask the cut-off,
+ * this can be faster with blendv (only available with SSE4.1 and later).
+ */
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define CUTOFF_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define CUTOFF_BLENDV
+#endif
+#endif
+
+ {
+ int cj,aj,ajx,ajy,ajz;
+
+#ifdef ENERGY_GROUPS
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+ /* Interaction (non-exclusion) mask of all 1's or 0's */
+ gmx_mm_pr int_SSE0;
+ gmx_mm_pr int_SSE2;
+#endif
+
+ gmx_mm_pr jxSSE,jySSE,jzSSE;
+ gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0;
+ gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2;
+ gmx_mm_pr tx_SSE0,ty_SSE0,tz_SSE0;
+ gmx_mm_pr tx_SSE2,ty_SSE2,tz_SSE2;
+ gmx_mm_pr rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
+ gmx_mm_pr rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
+#ifndef CUTOFF_BLENDV
+ /* wco: within cut-off, mask of all 1's or 0's */
+ gmx_mm_pr wco_SSE0;
+ gmx_mm_pr wco_SSE2;
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr wco_vdw_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr wco_vdw_SSE2;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+ /* 1/r masked with the interaction mask */
+ gmx_mm_pr rinv_ex_SSE0;
+ gmx_mm_pr rinv_ex_SSE2;
+#endif
+ gmx_mm_pr jq_SSE;
+ gmx_mm_pr qq_SSE0;
+ gmx_mm_pr qq_SSE2;
+#ifdef CALC_COUL_TAB
+ /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ gmx_mm_pr fsub_SSE0;
+ gmx_mm_pr fsub_SSE2;
+#endif
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr brsq_SSE0,brsq_SSE2;
+ gmx_mm_pr ewcorr_SSE0,ewcorr_SSE2;
+#endif
+
+ /* frcoul = (1/r - fsub)*r */
+ gmx_mm_pr frcoul_SSE0;
+ gmx_mm_pr frcoul_SSE2;
+#ifdef CALC_COUL_TAB
+ /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ gmx_mm_pr r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
+ gmx_mm_pr r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
+ /* Table index: rs truncated to an int */
+#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
+ gmx_epi32 ti_SSE0,ti_SSE2;
+#else
+ __m128i ti_SSE0,ti_SSE2;
+#endif
+ /* Linear force table values */
+ gmx_mm_pr ctab0_SSE0,ctab1_SSE0;
+ gmx_mm_pr ctab0_SSE2,ctab1_SSE2;
+#ifdef CALC_ENERGIES
+ /* Quadratic energy table value */
+ gmx_mm_pr ctabv_SSE0;
+ gmx_mm_pr ctabv_SSE2;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ /* The potential (PME mesh) we need to subtract from 1/r */
+ gmx_mm_pr vc_sub_SSE0;
+ gmx_mm_pr vc_sub_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ /* Electrostatic potential */
+ gmx_mm_pr vcoul_SSE0;
+ gmx_mm_pr vcoul_SSE2;
+#endif
+#endif
+ /* The force times 1/r */
+ gmx_mm_pr fscal_SSE0;
+ gmx_mm_pr fscal_SSE2;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+ /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ gmx_mm_pr hsig_j_SSE,seps_j_SSE;
+ /* LJ sigma_ij and epsilon_ij */
+ gmx_mm_pr sig_SSE0,eps_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig_SSE2,eps_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sig2_SSE0,sig6_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig2_SSE2,sig6_SSE2;
+#endif
+#endif /* LJ_COMB_LB */
+#endif /* CALC_LJ */
+
+#ifdef LJ_COMB_GEOM
+ gmx_mm_pr c6s_j_SSE,c12s_j_SSE;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ /* Index for loading LJ parameters, complicated when interleaving */
+ int aj2;
+#endif
+
+#ifndef FIX_LJ_C
+ /* LJ C6 and C12 parameters, used with geometric comb. rule */
+ gmx_mm_pr c6_SSE0,c12_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr c6_SSE2,c12_SSE2;
+#endif
+#endif
+
+ /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+ gmx_mm_pr rinvsix_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr rinvsix_SSE2;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+ gmx_mm_pr sir_SSE0,sir2_SSE0,sir6_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr sir_SSE2,sir2_SSE2,sir6_SSE2;
+#endif
+#endif
+
+ gmx_mm_pr FrLJ6_SSE0,FrLJ12_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr FrLJ6_SSE2,FrLJ12_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
+#ifndef HALF_LJ
+ gmx_mm_pr VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+ /* j-cluster index */
+ cj = l_cj[cjind].cj;
+
+ /* Atom indices (of the first atom in the cluster) */
+ aj = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+ aj2 = aj*2;
+#else
+ aj2 = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+ ajx = aj*DIM;
+#else
+ ajx = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+ ajy = ajx + STRIDE;
+ ajz = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
+ {
+ /* Load integer interaction mask */
+ /* With AVX there are no integer operations, so cast to real */
+ gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
+ /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
+ * With gcc we don't need the cast, but it's faster.
+ */
+#define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x))
+ int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
+ int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE);
+#undef cast_cvt
+ }
+#endif
+ /* load j atom coordinates */
+ jxSSE = gmx_loaddh_pr(x+ajx);
+ jySSE = gmx_loaddh_pr(x+ajy);
+ jzSSE = gmx_loaddh_pr(x+ajz);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE);
+ dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE);
+ dz_SSE0 = gmx_sub_pr(iz_SSE0,jzSSE);
+ dx_SSE2 = gmx_sub_pr(ix_SSE2,jxSSE);
+ dy_SSE2 = gmx_sub_pr(iy_SSE2,jySSE);
+ dz_SSE2 = gmx_sub_pr(iz_SSE2,jzSSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+#ifndef CUTOFF_BLENDV
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+ /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+ if (cj == ci_sh)
+ {
+ wco_SSE0 = gmx_and_pr(wco_SSE0,diag_SSE0);
+ wco_SSE2 = gmx_and_pr(wco_SSE2,diag_SSE2);
+ }
+#else
+#error "only UNROLLJ == UNROLLI currently supported in the joined kernels"
+#endif
+#else /* EXCL_FORCES */
+ /* Remove all excluded atom pairs from the list */
+ wco_SSE0 = gmx_and_pr(wco_SSE0,int_SSE0);
+ wco_SSE2 = gmx_and_pr(wco_SSE2,int_SSE2);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+ {
+ int i,j;
+ real tmp[UNROLLJ];
+ for(i=0; i<UNROLLI; i++)
+ {
+ gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : (i==1 ? wco_SSE1 : (i==2 ? wco_SSE2 : wco_SSE3)));
+ for(j=0; j<UNROLLJ; j++)
+ {
+ if (!(tmp[j] == 0))
+ {
+ npair++;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef CHECK_EXCLS
+ /* For excluded pairs add a small number to avoid r^-6 = NaN */
+ rsq_SSE0 = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
+ rsq_SSE2 = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
+#endif
+
+ /* Calculate 1/r */
+ rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0);
+ rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2);
+
+#ifdef CALC_COULOMB
+ /* Load parameters for j atom */
+ jq_SSE = gmx_loaddh_pr(q+aj);
+ qq_SSE0 = gmx_mul_pr(iq_SSE0,jq_SSE);
+ qq_SSE2 = gmx_mul_pr(iq_SSE2,jq_SSE);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+ load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+#ifndef HALF_LJ
+ load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
+ c6s_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
+ c12s_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
+ c6_SSE0 = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
+#ifndef HALF_LJ
+ c6_SSE2 = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
+#endif
+ c12_SSE0 = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
+#ifndef HALF_LJ
+ c12_SSE2 = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
+ hsig_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
+ seps_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
+
+ sig_SSE0 = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
+ eps_SSE0 = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
+#ifndef HALF_LJ
+ sig_SSE2 = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
+ eps_SSE2 = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifndef CUTOFF_BLENDV
+ rinv_SSE0 = gmx_and_pr(rinv_SSE0,wco_SSE0);
+ rinv_SSE2 = gmx_and_pr(rinv_SSE2,wco_SSE2);
+#else
+ /* We only need to mask for the cut-off: blendv is faster */
+ rinv_SSE0 = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
+ rinv_SSE2 = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
+#endif
+
+ rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
+ rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
+
+#ifdef CALC_COULOMB
+ /* Note that here we calculate force*r, not the usual force/r.
+ * This allows avoiding masking the reaction-field contribution,
+ * as frcoul is later multiplied by rinvsq which has been
+ * masked with the cut-off check.
+ */
+
+#ifdef EXCL_FORCES
+ /* Only add 1/r for non-excluded atom pairs */
+ rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0,int_SSE0);
+ rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2,int_SSE2);
+#else
+ /* No exclusion forces, we always need 1/r */
+#define rinv_ex_SSE0 rinv_SSE0
+#define rinv_ex_SSE2 rinv_SSE2
+#endif
+
+#ifdef CALC_COUL_RF
+ /* Electrostatic interactions */
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
+
+#ifdef CALC_ENERGIES
+ vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
+ vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ /* We need to mask (or limit) rsq for the cut-off,
+ * as large distances can cause an overflow in gmx_pmecorrF/V.
+ */
+#ifndef CUTOFF_BLENDV
+ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0));
+ brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2));
+#else
+ /* Strangely, putting mul on a separate line is slower (icc 13) */
+ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)));
+ brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)));
+#endif
+ ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE);
+ ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE);
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE);
+ vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE);
+#endif
+
+#endif /* CALC_COUL_EWALD */
+
+#ifdef CALC_COUL_TAB
+ /* Electrostatic interactions */
+ r_SSE0 = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
+ r_SSE2 = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
+ /* Convert r to scaled table units */
+ rs_SSE0 = gmx_mul_pr(r_SSE0,invtsp_SSE);
+ rs_SSE2 = gmx_mul_pr(r_SSE2,invtsp_SSE);
+ /* Truncate scaled r to an int */
+ ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0);
+ ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2);
+#ifdef GMX_X86_SSE4_1
+ /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
+ rf_SSE0 = gmx_floor_pr(rs_SSE0);
+ rf_SSE2 = gmx_floor_pr(rs_SSE2);
+#else
+ rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0);
+ rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2);
+#endif
+ frac_SSE0 = gmx_sub_pr(rs_SSE0,rf_SSE0);
+ frac_SSE2 = gmx_sub_pr(rs_SSE2,rf_SSE2);
+
+ /* Load and interpolate table forces and possibly energies.
+ * Force and energy can be combined in one table, stride 4: FDV0
+ * or in two separate tables with stride 1: F and V
+ * Currently single precision uses FDV0, double F and V.
+ */
+#ifndef CALC_ENERGIES
+ load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
+ load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
+#else
+#ifdef TAB_FDV0
+ load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+ load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#else
+ load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+ load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#endif
+#endif
+ fsub_SSE0 = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
+ fsub_SSE2 = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
+ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
+ frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
+ vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
+#endif
+#endif /* CALC_COUL_TAB */
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+ /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
+ vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
+ vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
+#else
+ vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
+ vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
+#endif
+#endif
+
+ vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
+ vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
+#endif
+
+#ifdef CALC_ENERGIES
+ /* Mask energy for cut-off and diagonal */
+ vcoul_SSE0 = gmx_and_pr(vcoul_SSE0,wco_SSE0);
+ vcoul_SSE2 = gmx_and_pr(vcoul_SSE2,wco_SSE2);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+ /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+ wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
+#ifndef HALF_LJ
+ wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
+#endif
+#else
+ /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define wco_vdw_SSE0 wco_SSE0
+#define wco_vdw_SSE2 wco_SSE2
+#endif
+
+#ifndef LJ_COMB_LB
+ rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
+#ifdef EXCL_FORCES
+ rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+ rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
+#ifdef EXCL_FORCES
+ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+ FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
+#ifndef HALF_LJ
+ FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
+#endif
+ FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
+#ifndef HALF_LJ
+ FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+ sir_SSE0 = gmx_mul_pr(sig_SSE0,rinv_SSE0);
+#ifndef HALF_LJ
+ sir_SSE2 = gmx_mul_pr(sig_SSE2,rinv_SSE2);
+#endif
+ sir2_SSE0 = gmx_mul_pr(sir_SSE0,sir_SSE0);
+#ifndef HALF_LJ
+ sir2_SSE2 = gmx_mul_pr(sir_SSE2,sir_SSE2);
+#endif
+ sir6_SSE0 = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
+#ifdef EXCL_FORCES
+ sir6_SSE0 = gmx_and_pr(sir6_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+ sir6_SSE2 = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
+#ifdef EXCL_FORCES
+ sir6_SSE2 = gmx_and_pr(sir6_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ sir6_SSE0 = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ sir6_SSE2 = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+ FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+ FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2,sir6_SSE2);
+#endif
+ FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+ FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
+#endif
+#if defined CALC_ENERGIES
+ /* We need C6 and C12 to calculate the LJ potential shift */
+ sig2_SSE0 = gmx_mul_pr(sig_SSE0,sig_SSE0);
+#ifndef HALF_LJ
+ sig2_SSE2 = gmx_mul_pr(sig_SSE2,sig_SSE2);
+#endif
+ sig6_SSE0 = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
+#ifndef HALF_LJ
+ sig6_SSE2 = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
+#endif
+ c6_SSE0 = gmx_mul_pr(eps_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+ c6_SSE2 = gmx_mul_pr(eps_SSE2,sig6_SSE2);
+#endif
+ c12_SSE0 = gmx_mul_pr(c6_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+ c12_SSE2 = gmx_mul_pr(c6_SSE2,sig6_SSE2);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+ * complicated when the i- and j-cluster size don't match.
+ */
+ {
+ int egps_j;
+#if UNROLLJ == 2
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for(jj=0; jj<(UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
+#endif
+ }
+#endif
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+ vctotSSE = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2));
+#else
+ add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj);
+ add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+ /* Calculate the LJ energies */
+ VLJ6_SSE0 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
+#ifndef HALF_LJ
+ VLJ6_SSE2 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
+#endif
+ VLJ12_SSE0 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
+#ifndef HALF_LJ
+ VLJ12_SSE2 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
+#endif
+
+ VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
+#endif
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
+#endif
+#ifdef CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
+ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,int_SSE0);
+#ifndef HALF_LJ
+ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,int_SSE2);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+ VvdwtotSSE = gmx_add_pr(VvdwtotSSE,
+#ifndef HALF_LJ
+ gmx_add_pr(VLJ_SSE0,VLJ_SSE2)
+#else
+ VLJ_SSE0
+#endif
+ );
+#else
+ add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj);
+#ifndef HALF_LJ
+ add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+ fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_SSE0,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
+#else
+ fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_SSE2,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
+#else
+ /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
+ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
+#endif
+
+ /* Calculate temporary vectorial force */
+ tx_SSE0 = gmx_mul_pr(fscal_SSE0,dx_SSE0);
+ tx_SSE2 = gmx_mul_pr(fscal_SSE2,dx_SSE2);
+ ty_SSE0 = gmx_mul_pr(fscal_SSE0,dy_SSE0);
+ ty_SSE2 = gmx_mul_pr(fscal_SSE2,dy_SSE2);
+ tz_SSE0 = gmx_mul_pr(fscal_SSE0,dz_SSE0);
+ tz_SSE2 = gmx_mul_pr(fscal_SSE2,dz_SSE2);
+
+ /* Increment i atom force */
+ fix_SSE0 = gmx_add_pr(fix_SSE0,tx_SSE0);
+ fix_SSE2 = gmx_add_pr(fix_SSE2,tx_SSE2);
+ fiy_SSE0 = gmx_add_pr(fiy_SSE0,ty_SSE0);
+ fiy_SSE2 = gmx_add_pr(fiy_SSE2,ty_SSE2);
+ fiz_SSE0 = gmx_add_pr(fiz_SSE0,tz_SSE0);
+ fiz_SSE2 = gmx_add_pr(fiz_SSE2,tz_SSE2);
+
+ /* Decrement j atom force */
+ gmx_store_hpr(f+ajx,
+ gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) ));
+ gmx_store_hpr(f+ajy,
+ gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) ));
+ gmx_store_hpr(f+ajz,
+ gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) ));
+ }
+
+#undef rinv_ex_SSE0
+#undef rinv_ex_SSE2
+
+#undef wco_vdw_SSE0
+#undef wco_vdw_SSE2
+
+#undef CUTOFF_BLENDV
+
+#undef EXCL_FORCES
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* GMX_MM256_HERE should be set before including this file */
+#include "gmx_simd_macros.h"
+
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
+
+#if defined GMX_MM256_HERE
+#define STRIDE 4
+#endif
+
+#ifdef GMX_MM256_HERE
+#ifndef GMX_DOUBLE
+/* single precision 2x(4+4) kernel */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#define TAB_FDV0
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+
+#define SIMD_MASK_ALL 0xffffffff
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and have a different combination rule for different
+ * ci's, as no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
+ * with all combinations off electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
+#else
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
+#endif
+#endif
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+ (const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f
+#ifdef CALC_SHIFTFORCES
+ ,
+ real *fshift
+#endif
+#ifdef CALC_ENERGIES
+ ,
+ real *Vvdw,
+ real *Vc
+#endif
+ )
+{
+ const nbnxn_ci_t *nbln;
+ const nbnxn_cj_t *l_cj;
+ const int *type;
+ const real *q;
+ const real *shiftvec;
+ const real *x;
+ const real *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
+ real facel;
+ real *nbfp_ptr;
+ int nbfp_stride;
+ int n,ci,ci_sh;
+ int ish,ish3;
+ gmx_bool do_LJ,half_LJ,do_coul;
+ int sci,scix,sciy,sciz,sci2;
+ int cjind0,cjind1,cjind;
+ int ip,jp;
+
+#ifdef ENERGY_GROUPS
+ int Vstride_i;
+ int egps_ishift,egps_imask;
+ int egps_jshift,egps_jmask,egps_jstride;
+ int egps_i;
+ real *vvdwtp[UNROLLI];
+ real *vctp[UNROLLI];
+#endif
+
+ gmx_mm_pr shX_SSE;
+ gmx_mm_pr shY_SSE;
+ gmx_mm_pr shZ_SSE;
+ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
+ gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
+ gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0;
+ gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2;
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+ __m128 fix_SSE,fiy_SSE,fiz_SSE;
+#else
+ __m256d fix_SSE,fiy_SSE,fiz_SSE;
+#endif
+#else
+ __m128d fix0_SSE,fiy0_SSE,fiz0_SSE;
+ __m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
+#endif
+
+ /* AVX: use floating point masks, as there are no integer instructions */
+ gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
+ gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+
+ gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
+ gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
+
+#ifdef GMX_X86_SSE4_1
+ gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+#endif
+
+ gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
+ gmx_mm_pr iq_SSE0=gmx_setzero_pr();
+ gmx_mm_pr iq_SSE2=gmx_setzero_pr();
+ gmx_mm_pr mrc_3_SSE;
+#ifdef CALC_ENERGIES
+ gmx_mm_pr hrc_3_SSE,moh_rc_SSE;
+#endif
+
+#ifdef CALC_COUL_TAB
+ /* Coulomb table variables */
+ gmx_mm_pr invtsp_SSE;
+ const real *tab_coul_F;
+#ifndef TAB_FDV0
+ const real *tab_coul_V;
+#endif
+#ifdef GMX_MM256_HERE
+ int ti0_array[2*UNROLLJ-1],*ti0;
+ int ti2_array[2*UNROLLJ-1],*ti2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr mhalfsp_SSE;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr beta2_SSE,beta_SSE;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ gmx_mm_pr sh_ewald_SSE;
+#endif
+
+#ifdef LJ_COMB_LB
+ const real *ljc;
+
+ gmx_mm_pr hsig_i_SSE0,seps_i_SSE0;
+ gmx_mm_pr hsig_i_SSE2,seps_i_SSE2;
+#else
+#ifdef FIX_LJ_C
+ real pvdw_array[2*UNROLLI*UNROLLJ+3];
+ real *pvdw_c6,*pvdw_c12;
+ gmx_mm_pr c6_SSE0,c12_SSE0;
+ gmx_mm_pr c6_SSE2,c12_SSE2;
+#endif
+
+#ifdef LJ_COMB_GEOM
+ const real *ljc;
+
+ gmx_mm_pr c6s_SSE0,c12s_SSE0;
+ gmx_mm_pr c6s_SSE1,c12s_SSE1;
+ gmx_mm_pr c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
+ gmx_mm_pr c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+ gmx_mm_pr vctotSSE,VvdwtotSSE;
+ gmx_mm_pr sixthSSE,twelvethSSE;
+
+ gmx_mm_pr avoid_sing_SSE;
+ gmx_mm_pr rc2_SSE;
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr rcvdw2_SSE;
+#endif
+
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sh_invrc6_SSE,sh_invrc12_SSE;
+
+ /* cppcheck-suppress unassignedVariable */
+ real tmpsum_array[15],*tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+ /* cppcheck-suppress unassignedVariable */
+ real shf_array[15],*shf;
+#endif
+
+ int ninner;
+
+#ifdef COUNT_PAIRS
+ int npair=0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ ljc = nbat->lj_comb;
+#else
+ /* No combination rule used */
+#ifndef GMX_DOUBLE
+ nbfp_ptr = nbat->nbfp_s4;
+#define NBFP_STRIDE 4
+#else
+ nbfp_ptr = nbat->nbfp;
+#define NBFP_STRIDE 2
+#endif
+ nbfp_stride = NBFP_STRIDE;
+#endif
+
+#ifdef CALC_COUL_TAB
+#ifdef GMX_MM256_HERE
+ /* Generate aligned table pointers */
+ ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+#endif
+
+ invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+ mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+ tab_coul_F = ic->tabq_coul_FDV0;
+#else
+ tab_coul_F = ic->tabq_coul_F;
+ tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+ beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+ q = nbat->q;
+ type = nbat->type;
+ facel = ic->epsfac;
+ shiftvec = shift_vec[0];
+ x = nbat->x;
+
+ avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+ /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+ rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+ rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+ sixthSSE = gmx_set1_pr(1.0/6.0);
+ twelvethSSE = gmx_set1_pr(1.0/12.0);
+
+ sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+ mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+ hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+
+ moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+#endif
+
+#ifdef CALC_ENERGIES
+ tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+#endif
+#ifdef CALC_SHIFTFORCES
+ shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+#endif
+
+#ifdef FIX_LJ_C
+ pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+ for(jp=0; jp<UNROLLJ; jp++)
+ {
+ pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+ pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ }
+ c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+ egps_ishift = nbat->neg_2log;
+ egps_imask = (1<<egps_ishift) - 1;
+ egps_jshift = 2*nbat->neg_2log;
+ egps_jmask = (1<<egps_jshift) - 1;
+ egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ /* Major division is over i-particle energy groups, determine the stride */
+ Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+ l_cj = nbl->cj;
+
+ ninner = 0;
+ for(n=0; n<nbl->nci; n++)
+ {
+ nbln = &nbl->ci[n];
+
+ ish = (nbln->shift & NBNXN_CI_SHIFT);
+ ish3 = ish*3;
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
+ ci = nbln->ci;
+ ci_sh = (ish == CENTRAL ? ci : -1);
+
+ shX_SSE = gmx_load1_pr(shiftvec+ish3);
+ shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+ sci = ci*STRIDE;
+ scix = sci*DIM;
+ sci2 = sci*2;
+#else
+ sci = (ci>>1)*STRIDE;
+ scix = sci*DIM + (ci & 1)*(STRIDE>>1);
+ sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
+ sci += (ci & 1)*(STRIDE>>1);
+#endif
+
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
+ do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+
+#ifdef ENERGY_GROUPS
+ egps_i = nbat->energrp[ci];
+ {
+ int ia,egp_ia;
+
+ for(ia=0; ia<UNROLLI; ia++)
+ {
+ egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ vctp[ia] = Vc + egp_ia*Vstride_i;
+ }
+ }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+ {
+ int ia;
+ real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+ Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+ Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+ Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+ /* beta/sqrt(pi) */
+ Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+ for(ia=0; ia<UNROLLI; ia++)
+ {
+ real qi;
+
+ qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+ vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+ Vc[0]
+#endif
+ -= facel*qi*qi*Vc_sub_self;
+ }
+ }
+#endif
+
+#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1)
+
+ /* Load i atom data */
+ sciy = scix + STRIDE;
+ sciz = sciy + STRIDE;
+ ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix) ,shX_SSE);
+ ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE);
+ iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy) ,shY_SSE);
+ iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE);
+ iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE);
+ iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
+
+ if (do_coul)
+ {
+ gmx_mm_pr facel_SSE;
+
+ facel_SSE = gmx_set1_pr(facel);
+
+ iq_SSE0 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci));
+ iq_SSE2 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2));
+ }
+
+#ifdef LJ_COMB_LB
+ hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
+ hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
+ seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+ seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+#else
+#ifdef LJ_COMB_GEOM
+ c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
+ if (!half_LJ)
+ {
+ c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
+ }
+ c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+ if (!half_LJ)
+ {
+ c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+ }
+#else
+ nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
+ nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ if (!half_LJ)
+ {
+ nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ }
+#endif
+#endif
+
+ /* Zero the potential energy for this list */
+ VvdwtotSSE = gmx_setzero_pr();
+ vctotSSE = gmx_setzero_pr();
+
+ /* Clear i atom forces */
+ fix_SSE0 = gmx_setzero_pr();
+ fix_SSE2 = gmx_setzero_pr();
+ fiy_SSE0 = gmx_setzero_pr();
+ fiy_SSE2 = gmx_setzero_pr();
+ fiz_SSE0 = gmx_setzero_pr();
+ fiz_SSE2 = gmx_setzero_pr();
+
+ cjind = cjind0;
+
+ /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+ if (half_LJ)
+ {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef HALF_LJ
+#undef CALC_COULOMB
+ }
+ else if (do_coul)
+ {
+#define CALC_COULOMB
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef CALC_COULOMB
+ }
+ else
+ {
+#define CHECK_EXCLS
+ while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for(; (cjind<cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+ }
+#undef CALC_LJ
+ ninner += cjind1 - cjind0;
+
+ /* Add accumulated i-forces to the force array */
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+#define gmx_load_ps4 _mm_load_ps
+#define gmx_store_ps4 _mm_store_ps
+#define gmx_add_ps4 _mm_add_ps
+#else
+#define gmx_load_ps4 _mm256_load_pd
+#define gmx_store_ps4 _mm256_store_pd
+#define gmx_add_ps4 _mm256_add_pd
+#endif
+ GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0,fix_SSE2,fix_SSE);
+ gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0,fiy_SSE2,fiy_SSE);
+ gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0,fiz_SSE2,fiz_SSE);
+ gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+ gmx_store_ps4(shf,fix_SSE);
+ fshift[ish3+0] += SUM_SIMD4(shf);
+ gmx_store_ps4(shf,fiy_SSE);
+ fshift[ish3+1] += SUM_SIMD4(shf);
+ gmx_store_ps4(shf,fiz_SSE);
+ fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
+ _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
+ _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
+ _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
+ _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
+ _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
+ _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
+ _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
+ fshift[ish3+0] += shf[0] + shf[1];
+ _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
+ fshift[ish3+1] += shf[0] + shf[1];
+ _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
+ fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ if (do_coul)
+ {
+ gmx_store_pr(tmpsum,vctotSSE);
+ *Vc += SUM_SIMD(tmpsum);
+ }
+
+ gmx_store_pr(tmpsum,VvdwtotSSE);
+ *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+ /* Outer loop uses 6 flops/iteration */
+ }
+
+#ifdef COUNT_PAIRS
+ printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef gmx_load2_hpr
+
+#undef gmx_load_ps4
+#undef gmx_store_ps4
+#undef gmx_store_ps4
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
-#ifdef GMX_X86_AVX_256
+#ifdef GMX_NBNXN_SIMD_4XN
-#include "nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernel_simd_4xn.h"
-/* Include all flavors of the 256-bit AVX kernel loops */
+/* Include all flavors of the SSE or AVX 4xN kernel loops */
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef CALC_COUL_RF
#define CALC_COUL_TAB
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_TAB
#define CALC_COUL_EWALD
/* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
/* Twin cut-off: rcoulomb >= rvdw */
#define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
#undef VDW_CUTOFF_CHECK
#undef CALC_COUL_EWALD
enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener
static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp
static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
{ NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
#undef NBK_FN
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener
static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
{ { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) },
{ NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) },
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
int ng_p2,i,j,j0,j1,c,s;
-#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
-
ng_p2 = (1<<ng_2log);
/* The size of the x86 SIMD energy group buffer array is:
- * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+ * ng*ng*ng_p2*unrollj_half*simd_width
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_HALF; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
+ for(s=0; s<unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += SIMD_WIDTH + 2;
+ c += simd_width + 2;
}
}
}
}
}
-#endif /* GMX_X86_AVX_256 */
+#endif /* GMX_NBNXN_SIMD_4XN */
void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw)
-#ifdef GMX_X86_AVX_256
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw)
+#ifdef GMX_NBNXN_SIMD_4XN
{
int nnbl;
nbnxn_pairlist_t **nbl;
}
#else
{
- gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
+ gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
}
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef _nbnxn_kernel_x86_simd256_h
-#define _nbnxn_kernel_x86_simd256_h
+#ifndef _nbnxn_kernel_simd_4xn_h
+#define _nbnxn_kernel_simd_4xn_h
#include "typedefs.h"
extern "C" {
#endif
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width N.
+ */
void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *ic,
- int ewald_excl,
- rvec *shift_vec,
- int force_flags,
- int clearF,
- real *fshift,
- real *Vc,
- real *Vvdw);
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw);
#ifdef __cplusplus
}
/* Include the force+energy kernels */
#define CALC_ENERGIES
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef CALC_ENERGIES
/* Include the force+energygroups kernels */
#define CALC_ENERGIES
#define ENERGY_GROUPS
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef ENERGY_GROUPS
#undef CALC_ENERGIES
/* Include the force only kernels */
#define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_GEOM
#define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
#undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* This is the innermost loop contents for the n vs n atom
- * SSE2 single precision kernels.
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel calculates interactions of 4 i-atoms
+ * with N j-atoms stored in N wide SIMD registers.
*/
gmx_mm_pr r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
gmx_mm_pr r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
gmx_mm_pr r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
- /* Table index: rs converted to an int */
+ /* Table index: rs truncated to an int */
#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
gmx_epi32 ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
#else
jxSSE = gmx_load_pr(x+ajx);
jySSE = gmx_load_pr(x+ajy);
jzSSE = gmx_load_pr(x+ajz);
-
+
/* Calculate distance */
dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE);
dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE);
dx_SSE3 = gmx_sub_pr(ix_SSE3,jxSSE);
dy_SSE3 = gmx_sub_pr(iy_SSE3,jySSE);
dz_SSE3 = gmx_sub_pr(iz_SSE3,jzSSE);
-
+
/* rsq = dx*dx+dy*dy+dz*dz */
rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
*/
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE
+#define UNROLLJ GMX_SIMD_WIDTH_HERE
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE 4
#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif
#define SIMD_MASK_ALL 0xffffffff
-#include "nbnxn_kernel_x86_simd_utils.h"
+#include "nbnxn_kernel_simd_utils.h"
/* All functionality defines are set here, except for:
* CALC_ENERGIES, ENERGY_GROUPS which are defined before.
/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
-#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
+ * with all combinations off electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
#if defined LJ_COMB_GEOM
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
#else
#if defined LJ_COMB_LB
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
#else
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
#endif
#endif
#ifdef CALC_COUL_RF
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
#else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
#endif
#endif
#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
#else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
-#endif
-#endif
-
-#ifdef GMX_MM128_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
#endif
-#ifdef GMX_MM256_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
#endif
static void
#ifndef CALC_ENERGIES
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,noener)
#else
#ifndef ENERGY_GROUPS
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,ener)
#else
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
#endif
#endif
#undef NBK_FUNC_NAME
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
__m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
__m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
__m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
__m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
/* AVX: use floating point masks, as there are no integer instructions */
#ifndef GMX_DOUBLE
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
__m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
__m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
__m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
-#else /* GMX_MM256_HERE */
+#endif
+#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
__m128i zeroi_SSE = _mm_setzero_si128();
#endif
#ifdef GMX_X86_SSE4_1
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
}
#endif
- /* Load i atom data */
+ /* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix) ,shX_SSE);
iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
iq_SSE0 = gmx_set1_pr(facel*q[sci]);
iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
#undef HALF_LJ
#undef CALC_COULOMB
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
#undef CALC_COULOMB
}
#define CHECK_EXCLS
while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
}
#undef CHECK_EXCLS
for(; (cjind<cjind1); cjind++)
{
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
}
}
#undef CALC_LJ
i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
}
+#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE) \
+{ \
+ i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps()); \
+ i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps()); \
+ i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
+ i_SSE2 = _mm256_permute_ps(i_SSE0,0b10110001); \
+ o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+}
#else
#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
{ \
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
}
+#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE) \
+{ \
+ __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
+ int p; \
+ \
+ for(p=0; p<2*UNROLLJ; p++) \
+ { \
+ /* Here we load 4 aligned floats, but we need just 2 */ \
+ clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
+ } \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+ \
+ GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE); \
+ GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
+}
+
#endif
#if defined GMX_MM128_HERE && defined GMX_DOUBLE
/* Add energy register to possibly multiple terms in the energy array.
* This function is the same for SSE/AVX single/double.
*/
-static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
+static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,const int *offset_jj)
{
int jj;
{
gmx_mm_pr v_SSE;
- v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
- gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
+ v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
}
}
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
+/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
+static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
+ real *v0,real *v1,const int *offset_jj)
+{
+ gmx_mm_hpr e_SSE0,e_SSE1;
+ int jj;
+
+ e_SSE0 = _mm256_extractf128_ps(e_SSE,0);
+ e_SSE1 = _mm256_extractf128_ps(e_SSE,1);
+
+ for(jj=0; jj<(UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_SSE;
+
+ v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
+ }
+ for(jj=0; jj<(UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_SSE;
+
+ v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
+ }
+}
+#endif
+
#endif /* _nbnxn_kernel_sse_utils_h_ */
#ifndef GMX_DOUBLE
#define NBNXN_SEARCH_SSE_SINGLE
-#include "gmx_x86_simd_single.h"
-#else
-#include "gmx_x86_simd_double.h"
#endif
+/* Include basic SSE2 stuff */
+#include <emmintrin.h>
+
#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
#define NBNXN_8BB_SSE
#endif
#define STRIDE_8BB 4
#define STRIDE_8BB_2LOG 2
+#endif /* NBNXN_SEARCH_SSE */
+
+#ifdef GMX_NBNXN_SIMD
/* The functions below are macros as they are performance sensitive */
#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
/* The j-cluster size is matched to the SIMD width */
-#ifndef GMX_DOUBLE
-/* 128 bits can hold 4 floats */
-#define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci)
-#define X_IND_CI_S128(ci) X_IND_CI_J4(ci)
-#define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj)
-/* 256 bits can hold 8 floats */
-#define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci)
-#define X_IND_CI_S256(ci) X_IND_CI_J8(ci)
-#define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj)
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
#else
-/* 128 bits can hold 2 doubles */
-#define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci)
-#define X_IND_CI_S128(ci) X_IND_CI_J2(ci)
-#define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj)
-/* 256 bits can hold 4 doubles */
-#define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci)
-#define X_IND_CI_S256(ci) X_IND_CI_J4(ci)
-#define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj)
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
+#endif
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
+/* Half SIMD with j-cluster size */
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
+#endif
+#else
+#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#endif
#endif
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* GMX_NBNXN_SIMD */
/* Interaction masks for 4xN atom interactions.
{
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
- case nbk4xN_X86_SIMD128:
- case nbk4xN_X86_SIMD256:
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
return NBNXN_CPU_CLUSTER_I_SIZE;
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
/* The cluster size for super/sub lists is only set here.
* Any value should work for the pair-search and atomdata code.
* The kernels, of course, might require a particular value.
int nbnxn_kernel_to_cj_size(int nb_kernel_type)
{
+ int nbnxn_simd_width=0;
+ int cj_size=0;
+
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#endif
+
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
- return NBNXN_CPU_CLUSTER_I_SIZE;
- case nbk4xN_X86_SIMD128:
- /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
- return 16/sizeof(real);
- case nbk4xN_X86_SIMD256:
- /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
- return 32/sizeof(real);
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
- return nbnxn_kernel_to_ci_size(nb_kernel_type);
+ case nbnxnk4x4_PlainC:
+ cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
+ break;
+ case nbnxnk4xN_SIMD_4xN:
+ cj_size = nbnxn_simd_width;
+ break;
+ case nbnxnk4xN_SIMD_2xNN:
+ cj_size = nbnxn_simd_width/2;
+ break;
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
+ break;
default:
gmx_incons("unknown kernel type");
}
- return 0;
+ return cj_size;
}
static int ci_to_cj(int na_cj_2log,int ci)
gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
{
- if (nb_kernel_type == nbkNotSet)
+ if (nb_kernel_type == nbnxnkNotSet)
{
gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
}
switch (nb_kernel_type)
{
- case nbk8x8x8_CUDA:
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
return FALSE;
- case nbk4x4_PlainC:
- case nbk4xN_X86_SIMD128:
- case nbk4xN_X86_SIMD256:
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
return TRUE;
default:
snew(nbl->work,1);
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
#else
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
-#endif
- snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
-#ifdef NBNXN_SEARCH_SSE
- snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
-#ifdef GMX_X86_AVX_256
- snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
#endif
+ snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+#ifdef GMX_NBNXN_SIMD
+ snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
+ snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
#endif
- snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
+ snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
}
-#ifdef NBNXN_SEARCH_SSE
/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
{
#endif
}
-#ifdef GMX_X86_AVX_256
/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
{
return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
(rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
NBNXN_INT_MASK_ALL));
-#else /* cj-size = 2 */
+#else /* cj-size = 4 */
return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
#endif
}
+
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define get_imask_x86_simd_4xn get_imask_x86_simd128
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define get_imask_x86_simd_4xn get_imask_x86_simd256
+#define get_imask_x86_simd_2xnn get_imask_x86_simd128
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
#endif
-#endif /* NBNXN_SEARCH_SSE */
/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
* Checks bounding box distances and possibly atom pair distances.
}
}
-#ifdef NBNXN_SEARCH_SSE
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S PACK_X4
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S GMX_X86_SIMD_WIDTH_HERE
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM256_HERE
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "nbnxn_search_simd_4xn.h"
#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "nbnxn_search_simd_2xnn.h"
#endif
/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
{
sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
- if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
{
- nbl->work->ncj_hlj += jlen;
+ nbl->work->ncj_noq += jlen;
}
- else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
{
- nbl->work->ncj_noq += jlen;
+ nbl->work->ncj_hlj += jlen;
}
nbl->nci++;
switch (nb_kernel_type)
{
- case nbk4x4_PlainC:
+ case nbnxnk4x4_PlainC:
check_subcell_list_space_simple(nbl,cl-cf+1);
make_cluster_list_simple(gridj,
rl2,rbb2,
&ndistc);
break;
-#ifdef NBNXN_SEARCH_SSE
- case nbk4xN_X86_SIMD128:
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- make_cluster_list_x86_simd128(gridj,
- nbl,ci,cf,cl,
- (gridi == gridj && shift == CENTRAL),
- nbat->x,
- rl2,rbb2,
- &ndistc);
+ make_cluster_list_simd_4xn(gridj,
+ nbl,ci,cf,cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2,rbb2,
+ &ndistc);
break;
-#ifdef GMX_X86_AVX_256
- case nbk4xN_X86_SIMD256:
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- make_cluster_list_x86_simd256(gridj,
- nbl,ci,cf,cl,
- (gridi == gridj && shift == CENTRAL),
- nbat->x,
- rl2,rbb2,
- &ndistc);
+ make_cluster_list_simd_2xnn(gridj,
+ nbl,ci,cf,cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2,rbb2,
+ &ndistc);
break;
#endif
-#endif
- case nbk8x8x8_PlainC:
- case nbk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
check_subcell_list_space_supersub(nbl,cl-cf+1);
for(cj=cf; cj<=cl; cj++)
{
{
switch (nb_kernel_type)
{
-#ifdef NBNXN_SEARCH_SSE
- case nbk4xN_X86_SIMD128:
- nbs->icell_set_x = icell_set_x_x86_simd128;
- break;
-#ifdef GMX_X86_AVX_256
- case nbk4xN_X86_SIMD256:
- nbs->icell_set_x = icell_set_x_x86_simd256;
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
+ nbs->icell_set_x = icell_set_x_simd_4xn;
break;
#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
+ nbs->icell_set_x = icell_set_x_simd_2xnn;
+ break;
#endif
default:
nbs->icell_set_x = icell_set_x_simple;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
+#else
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+{
+ gmx_mm_hpr a_SSE;
+
+ a_SSE = _mm_load_ps(a);
+
+ return gmx_2hpr_to_pr(a_SSE,a_SSE);
+}
+
+static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift)
+{
+ gmx_mm_hpr a0,a1;
+
+ a0 = _mm_set1_ps(a[0] + shift);
+ a1 = _mm_set1_ps(a[1] + shift);
+
+ return gmx_2hpr_to_pr(a1,a0);
+}
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_2xnn(int ci,
+ real shx,real shy,real shz,
+ int na_c,
+ int stride,const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_2xnn_t *x_ci;
+
+ x_ci = work->x_ci_simd_2xnn;
+
+ ia = X_IND_CI_SIMD_2XNN(ci);
+
+ x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+ x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+ x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+ x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+ x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+ x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+}
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bouding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci,int cjf,int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2,float rbb2,
+ int *ndistc)
+{
+ const nbnxn_x_ci_simd_2xnn_t *work;
+ const float *bb_ci;
+
+ gmx_mm_pr jx_SSE,jy_SSE,jz_SSE;
+
+ gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0;
+ gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2;
+
+ gmx_mm_pr rsq_SSE0;
+ gmx_mm_pr rsq_SSE2;
+
+ gmx_mm_pr wco_SSE0;
+ gmx_mm_pr wco_SSE2;
+ gmx_mm_pr wco_any_SSE;
+
+ gmx_mm_pr rc2_SSE;
+
+ gmx_bool InRange;
+ float d2;
+ int xind_f,xind_l,cj;
+
+ cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+ cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+
+ work = nbl->work->x_ci_simd_2xnn;
+
+ bb_ci = nbl->work->bb_ci;
+
+ rc2_SSE = gmx_set1_pr(rl2);
+
+ InRange = FALSE;
+ while (!InRange && cjf <= cjl)
+ {
+ d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+ wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+ InRange = gmx_movemask_pr(wco_any_SSE);
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjf++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
+ InRange = FALSE;
+ while (!InRange && cjl > cjf)
+ {
+ d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+ wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+ InRange = gmx_movemask_pr(wco_any_SSE);
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjl--;
+ }
+ }
+
+ if (cjf <= cjl)
+ {
+ for(cj=cjf; cj<=cjl; cj++)
+ {
+ /* Store cj and the interaction mask */
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj);
+ nbl->ncj++;
+ }
+ /* Increase the closing index in i super-cell list */
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ }
+}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file.
- * gmx_sse_or_avh.h should be included before including this file.
- */
-
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
-#ifdef GMX_MM128_HERE
-static void icell_set_x_x86_simd128
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
#else
-#ifdef GMX_MM256_HERE
-static void icell_set_x_x86_simd256
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
- (int ci,
- real shx,real shy,real shz,
- int na_c,
- int stride,const real *x,
- nbnxn_list_work_t *work)
-{
- int ia;
-#ifdef GMX_MM128_HERE
- nbnxn_x_ci_x86_simd128_t *x_ci;
-
- x_ci = work->x_ci_x86_simd128;
+#include "gmx_simd_macros.h"
- ia = X_IND_CI_S128(ci);
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE)
#else
- nbnxn_x_ci_x86_simd256_t *x_ci;
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_4xn(int ci,
+ real shx,real shy,real shz,
+ int na_c,
+ int stride,const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_4xn_t *x_ci;
- x_ci = work->x_ci_x86_simd256;
+ x_ci = work->x_ci_simd_4xn;
- ia = X_IND_CI_S256(ci);
-#endif
+ ia = X_IND_CI_SIMD_4XN(ci);
x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
}
-/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
* Checks bouding box distances and possibly atom pair distances.
* This is an accelerated version of make_cluster_list_simple.
*/
-#ifdef GMX_MM128_HERE
-static void make_cluster_list_x86_simd128
-#else
-#ifdef GMX_MM256_HERE
-static void make_cluster_list_x86_simd256
-#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
-#endif
-#endif
- (const nbnxn_grid_t *gridj,
- nbnxn_pairlist_t *nbl,
- int ci,int cjf,int cjl,
- gmx_bool remove_sub_diag,
- const real *x_j,
- real rl2,float rbb2,
- int *ndistc)
+static gmx_inline void
+make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci,int cjf,int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2,float rbb2,
+ int *ndistc)
{
-#ifdef GMX_MM128_HERE
- const nbnxn_x_ci_x86_simd128_t *work;
-#else
- const nbnxn_x_ci_x86_simd256_t *work;
-#endif
-
+ const nbnxn_x_ci_simd_4xn_t *work;
const float *bb_ci;
gmx_mm_pr jx_SSE,jy_SSE,jz_SSE;
float d2;
int xind_f,xind_l,cj;
-#ifdef GMX_MM128_HERE
- cjf = CI_TO_CJ_S128(cjf);
- cjl = CI_TO_CJ_S128(cjl+1) - 1;
-
- work = nbl->work->x_ci_x86_simd128;
-#else
- cjf = CI_TO_CJ_S256(cjf);
- cjl = CI_TO_CJ_S256(cjl+1) - 1;
+ cjf = CI_TO_CJ_SIMD_4XN(cjf);
+ cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
- work = nbl->work->x_ci_x86_simd256;
-#endif
+ work = nbl->work->x_ci_simd_4xn;
bb_ci = nbl->work->bb_ci;
}
else if (d2 < rl2)
{
-#ifdef GMX_MM128_HERE
- xind_f = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf);
-#else
- xind_f = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf);
-#endif
+ xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+
jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
}
else if (d2 < rl2)
{
-#ifdef GMX_MM128_HERE
- xind_l = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl);
-#else
- xind_l = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl);
-#endif
+ xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+
jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
for(cj=cjf; cj<=cjl; cj++)
{
/* Store cj and the interaction mask */
-#ifdef GMX_MM128_HERE
- nbl->cj[nbl->ncj].cj = CI_TO_CJ_S128(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj);
-#else
- nbl->cj[nbl->ncj].cj = CI_TO_CJ_S256(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj);
-#endif
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj);
nbl->ncj++;
}
/* Increase the closing index in i super-cell list */
nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
}
}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#ifdef GMX_LIB_MPI
gmx_incons("Invalid cut-off scheme passed!");
}
- if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
{
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
switch (nbvg->kernel_type)
{
- case nbk4x4_PlainC:
+ case nbnxnk4x4_PlainC:
nbnxn_kernel_ref(&nbvg->nbl_lists,
nbvg->nbat, ic,
fr->shift_vec,
enerd->grpp.ener[egLJSR]);
break;
- case nbk4xN_X86_SIMD128:
- nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
+ case nbnxnk4xN_SIMD_4xN:
+ nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
break;
- case nbk4xN_X86_SIMD256:
- nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
+ case nbnxnk4xN_SIMD_2xNN:
+ nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
break;
- case nbk8x8x8_CUDA:
+ case nbnxnk8x8x8_CUDA:
nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
break;
- case nbk8x8x8_PlainC:
+ case nbnxnk8x8x8_PlainC:
nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
nbvg->nbat, ic,
fr->shift_vec,
gmx_incons("Invalid nonbonded kernel type passed!");
}
- if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
{
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
bDoForces = (flags & GMX_FORCE_FORCES);
bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
bUseGPU = fr->nbv->bUseGPU;
- bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+ bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
if (bStateChanged)
{
wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+ if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
{
/* initialize non-local pair-list on the GPU */
nbnxn_cuda_init_pairlist(nbv->cu_nbv,
* numbers per nx+1 data points. For performance reasons we want
* the table data to be aligned to 16-byte.
*/
- snew_aligned(table.data, 12*(nx+1)*sizeof(real),16);
+ snew_aligned(table.data, 12*(nx+1)*sizeof(real),32);
for(k=0; (k<etiNR); k++) {
if (tabsel[k] != etabUSER) {
* to do this :-)
*/
- snew_aligned(table.data,4*nx,16);
+ snew_aligned(table.data,4*nx,32);
init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
* to do this :-)
*/
- snew_aligned(table.data,4*nx,16);
+ snew_aligned(table.data,4*nx,32);
copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
{
static const char *desc[] = {
"[TT]g_angle[tt] computes the angle distribution for a number of angles",
- "or dihedrals. This way you can check whether your simulation",
- "is correct. With option [TT]-ov[tt] you can plot the average angle of",
- "a group of angles as a function of time. With the [TT]-all[tt] option",
- "the first graph is the average, the rest are the individual angles.[PAR]",
+ "or dihedrals.[PAR]",
+ "With option [TT]-ov[tt], you can plot the average angle of",
+ "a group of angles as a function of time. With the [TT]-all[tt] option,",
+ "the first graph is the average and the rest are the individual angles.[PAR]",
"With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
"dihedrals (only for dihedrals) as function of time, but this is",
- "probably only fun for a selected few.[PAR]",
- "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
- "It should be noted that the index file should contain",
- "atom-triples for angles or atom-quadruplets for dihedrals.",
+ "probably only fun for a select few.[PAR]",
+ "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+ "It should be noted that the index file must contain",
+ "atom triplets for angles or atom quadruplets for dihedrals.",
"If this is not the case, the program will crash.[PAR]",
- "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
- "sin of selected dihedral angles which subsequently can be used as",
- "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+ "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+ "sin of selected dihedral angles, which subsequently can be used as",
+ "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
"Option [TT]-ot[tt] plots when transitions occur between",
"dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
"records a histogram of the times between such transitions,",
/* Returns TRUE when "opt" is needed at launch time */
static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
{
- /* Apart from the input .tpr we need all options that were set
+ /* Apart from the input .tpr and the error log we need all options that were set
* on the command line and that do not start with -b */
- if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2))
+ if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2) || 0 == strncmp(opt,"-err", 4))
+ {
return FALSE;
+ }
- if (bSet)
- return TRUE;
- else
- return FALSE;
+ return bSet;
}